Sequence File Scanner

Commit in the apache/impala repository (mirror of https://github.com/apache/impala.git).
@@ -2,7 +2,6 @@
|
||||
|
||||
#include <boost/algorithm/string/join.hpp>
|
||||
|
||||
#include <glog/logging.h>
|
||||
#include "common/status.h"
|
||||
|
||||
using namespace std;
|
||||
@@ -10,27 +9,8 @@ using namespace boost::algorithm;
|
||||
|
||||
namespace impala {
|
||||
|
||||
struct Status::ErrorDetail {
|
||||
vector<string> error_msgs;
|
||||
|
||||
ErrorDetail(const string& msg): error_msgs(1, msg) {}
|
||||
ErrorDetail(const vector<string>& msgs): error_msgs(msgs) {}
|
||||
};
|
||||
|
||||
const Status Status::OK;
|
||||
|
||||
Status::Status(const string& error_msg)
|
||||
: error_detail_(new ErrorDetail(error_msg)) {
|
||||
LOG(ERROR) << "Error Status: " << error_msg;
|
||||
}
|
||||
|
||||
Status::Status(const Status& status)
|
||||
: error_detail_(
|
||||
status.error_detail_ != NULL
|
||||
? new ErrorDetail(*status.error_detail_)
|
||||
: NULL) {
|
||||
}
|
||||
|
||||
Status& Status::operator=(const Status& status) {
|
||||
delete error_detail_;
|
||||
if (status.error_detail_ == NULL) {
|
||||
@@ -56,10 +36,6 @@ Status& Status::operator=(const TStatus& status) {
|
||||
return *this;
|
||||
}
|
||||
|
||||
Status::~Status() {
|
||||
if (error_detail_ != NULL) delete error_detail_;
|
||||
}
|
||||
|
||||
void Status::GetErrorMsgs(vector<string>* msgs) const {
|
||||
msgs->clear();
|
||||
if (error_detail_ != NULL) {
|
||||
|
||||
@@ -6,6 +6,7 @@
|
||||
#include <string>
|
||||
#include <vector>
|
||||
|
||||
#include <glog/logging.h>
|
||||
#include "common/compiler-util.h"
|
||||
#include "gen-cpp/Types_types.h" // for TStatus
|
||||
|
||||
@@ -34,11 +35,24 @@ class Status {
|
||||
|
||||
static const Status OK;
|
||||
|
||||
// c'tor for error case
|
||||
Status(const std::string& error_msg);
|
||||
|
||||
// copy c'tor makes copy of error detail so Status can be returned by value
|
||||
Status(const Status& status);
|
||||
Status(const Status& status)
|
||||
: error_detail_(
|
||||
status.error_detail_ != NULL
|
||||
? new ErrorDetail(*status.error_detail_)
|
||||
: NULL) {
|
||||
}
|
||||
|
||||
// c'tor for error case
|
||||
Status(const std::string& error_msg)
|
||||
: error_detail_(new ErrorDetail(error_msg)) {
|
||||
LOG(WARNING) << "Error Status: " << error_msg;
|
||||
}
|
||||
|
||||
~Status() {
|
||||
if (error_detail_ != NULL) delete error_detail_;
|
||||
}
|
||||
|
||||
|
||||
// same as copy c'tor
|
||||
Status& operator=(const Status& status);
|
||||
@@ -53,8 +67,6 @@ class Status {
|
||||
// assign from stringstream
|
||||
Status& operator=(const std::stringstream& stream);
|
||||
|
||||
~Status();
|
||||
|
||||
bool ok() const { return error_detail_ == NULL; }
|
||||
|
||||
void AddErrorMsg(const std::string& msg);
|
||||
@@ -71,7 +83,13 @@ class Status {
|
||||
std::string GetErrorMsg() const;
|
||||
|
||||
private:
|
||||
struct ErrorDetail;
|
||||
struct ErrorDetail {
|
||||
std::vector<std::string> error_msgs;
|
||||
|
||||
ErrorDetail(const std::string& msg): error_msgs(1, msg) {}
|
||||
ErrorDetail(const std::vector<std::string>& msgs): error_msgs(msgs) {}
|
||||
};
|
||||
|
||||
ErrorDetail* error_detail_;
|
||||
};
|
||||
|
||||
|
||||
@@ -10,6 +10,8 @@ set(EXECUTABLE_OUTPUT_PATH "${BUILD_OUTPUT_ROOT_DIRECTORY}/exec")
|
||||
|
||||
add_library(Exec STATIC
|
||||
aggregation-node.cc
|
||||
buffered-byte-stream.cc
|
||||
delimited-text-parser.cc
|
||||
exec-node.cc
|
||||
exchange-node.cc
|
||||
hash-join-node.cc
|
||||
@@ -18,6 +20,7 @@ add_library(Exec STATIC
|
||||
hdfs-scanner.cc
|
||||
hash-table.cc
|
||||
hdfs-rcfile-scanner.cc
|
||||
hdfs-sequence-scanner.cc
|
||||
hdfs-text-scanner.cc
|
||||
hbase-scan-node.cc
|
||||
hbase-table-scanner.cc
|
||||
@@ -34,4 +37,5 @@ target_link_libraries(Exec
|
||||
target_link_libraries(Exec
|
||||
${JAVA_JVM_LIBRARY}
|
||||
${HDFS_LIBS}
|
||||
-lz -lbz2 -lsnappy
|
||||
)
|
||||
|
||||
be/src/exec/buffered-byte-stream.cc (new file, 93 lines)
@@ -0,0 +1,93 @@
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
|
||||
#include "exec/buffered-byte-stream.h"
|
||||
#include "common/status.h"
|
||||
#include <glog/logging.h>
|
||||
#include <sstream>
|
||||
|
||||
using namespace impala;
|
||||
using namespace std;
|
||||
|
||||
BufferedByteStream::BufferedByteStream(ByteStream* parent,
|
||||
int64_t buffer_size,
|
||||
RuntimeProfile::Counter* timer)
|
||||
: parent_byte_stream_(parent),
|
||||
mem_pool_(new MemPool()),
|
||||
byte_buffer_size_(buffer_size),
|
||||
byte_buffer_(mem_pool_->Allocate(byte_buffer_size_)),
|
||||
byte_offset_(0),
|
||||
byte_buffer_start_(0),
|
||||
byte_buffer_len_(0),
|
||||
scanner_timer_(timer) {
|
||||
}
|
||||
|
||||
Status BufferedByteStream::GetPosition(int64_t* position) {
|
||||
*position = byte_buffer_start_ + byte_offset_;
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status BufferedByteStream::Open(const string& location) {
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status BufferedByteStream::Read(char* buf, int64_t req_len, int64_t* actual_len) {
|
||||
DCHECK(buf != NULL);
|
||||
DCHECK_GE(req_len, 0);
|
||||
|
||||
int number_bytes_read = 0;
|
||||
if (req_len <= byte_buffer_len_ - byte_offset_) {
|
||||
memcpy(buf, byte_buffer_ + byte_offset_, req_len);
|
||||
number_bytes_read = req_len;
|
||||
byte_offset_ += number_bytes_read;
|
||||
} else {
|
||||
while (number_bytes_read < req_len) {
|
||||
int copy_len = min(byte_buffer_len_ - byte_offset_, req_len - number_bytes_read);
|
||||
memcpy(buf + number_bytes_read, byte_buffer_ + byte_offset_, copy_len);
|
||||
number_bytes_read += copy_len;
|
||||
byte_offset_ += copy_len;
|
||||
if (byte_offset_ == byte_buffer_len_) {
|
||||
byte_buffer_start_ += byte_buffer_len_;
|
||||
{
|
||||
if (scanner_timer_ != NULL) {
|
||||
COUNTER_SCOPED_TIMER(scanner_timer_);
|
||||
}
|
||||
RETURN_IF_ERROR(parent_byte_stream_->Read(
|
||||
byte_buffer_, byte_buffer_size_, &byte_buffer_len_));
|
||||
}
|
||||
byte_offset_ = 0;
|
||||
|
||||
if (byte_buffer_len_ == 0) break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
*actual_len = number_bytes_read;
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status BufferedByteStream::Close() {
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status BufferedByteStream::Seek(int64_t offset) {
|
||||
if (offset >= byte_buffer_start_ && offset < byte_buffer_start_ + byte_buffer_len_) {
|
||||
byte_offset_ = offset - byte_buffer_start_;
|
||||
} else {
|
||||
RETURN_IF_ERROR(parent_byte_stream_->Seek(offset));
|
||||
byte_buffer_start_ = offset;
|
||||
byte_buffer_len_ = 0;
|
||||
byte_offset_ = 0;
|
||||
}
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status BufferedByteStream::Eof(bool* eof) {
|
||||
if (byte_offset_ < byte_buffer_len_) {
|
||||
*eof = false;
|
||||
return Status::OK;
|
||||
}
|
||||
RETURN_IF_ERROR(SyncParent());
|
||||
RETURN_IF_ERROR(parent_byte_stream_->Eof(eof));
|
||||
return Status::OK;
|
||||
}
|
||||
be/src/exec/buffered-byte-stream.h (new file, 74 lines)
@@ -0,0 +1,74 @@
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
|
||||
#ifndef IMPALA_EXEC_BUFFERED_BYTE_STREAM_H_
|
||||
#define IMPALA_EXEC_BUFFERED_BYTE_STREAM_H_
|
||||
|
||||
#include <string>
|
||||
#include <hdfs.h>
|
||||
#include <boost/scoped_ptr.hpp>
|
||||
|
||||
#include "util/runtime-profile.h"
|
||||
#include "exec/byte-stream.h"
|
||||
#include "runtime/mem-pool.h"
|
||||
#include "common/status.h"
|
||||
|
||||
namespace impala {
|
||||
|
||||
// A Buffered ByteStream implementation.
|
||||
// This class provides buffered reads from the underlying parent byte stream.
|
||||
// TODO: This is needed because of the way SerDeUtils works; we should revisit this.
|
||||
class BufferedByteStream : public ByteStream {
|
||||
public:
|
||||
BufferedByteStream(ByteStream* parent,
|
||||
int64_t buffer_size, RuntimeProfile::Counter* timer = NULL);
|
||||
|
||||
virtual Status Open(const std::string& location);
|
||||
virtual Status Close();
|
||||
virtual Status Read(char *buf, int64_t req_len, int64_t* actual_len);
|
||||
virtual Status Seek(int64_t offset);
|
||||
virtual Status GetPosition(int64_t* position);
|
||||
virtual Status Eof(bool* eof);
|
||||
|
||||
// Set the parent offset to our current position.
|
||||
Status SyncParent() {
|
||||
RETURN_IF_ERROR(parent_byte_stream_->Seek(byte_buffer_start_ + byte_offset_));
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
// Set our position to where the parent is.
|
||||
Status SeekToParent() {
|
||||
int64_t position;
|
||||
RETURN_IF_ERROR(parent_byte_stream_->GetPosition(&position));
|
||||
RETURN_IF_ERROR(Seek(position));
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
private:
|
||||
// Pointer to the source byte stream.
|
||||
ByteStream* parent_byte_stream_;
|
||||
|
||||
// Memory pool to allocate buffers.
|
||||
boost::scoped_ptr<MemPool> mem_pool_;
|
||||
|
||||
// Size of the buffer.
|
||||
int64_t byte_buffer_size_;
|
||||
|
||||
// Buffer containing bytes.
|
||||
char* byte_buffer_;
|
||||
|
||||
// Current offset within buffer.
|
||||
int64_t byte_offset_;
|
||||
|
||||
// Position of the start of the buffer in the parent byte stream.
|
||||
int64_t byte_buffer_start_;
|
||||
|
||||
// Amount of data in buffer.
|
||||
int64_t byte_buffer_len_;
|
||||
|
||||
RuntimeProfile::Counter* scanner_timer_;
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
#endif
|
||||
@@ -13,6 +13,8 @@ class Status;
|
||||
// A simple wrapper around sources of byte data
|
||||
class ByteStream {
|
||||
public:
|
||||
virtual ~ByteStream() { }
|
||||
|
||||
// Opens a resource from supplied location, ready for reading
|
||||
virtual Status Open(const std::string& location) = 0;
|
||||
|
||||
@@ -29,6 +31,9 @@ class ByteStream {
|
||||
// Returns the position of the stream cursor
|
||||
virtual Status GetPosition(int64_t* position) = 0;
|
||||
|
||||
// Returns if the stream is at EOF
|
||||
virtual Status Eof(bool* eof) = 0;
|
||||
|
||||
// Returns the name of the resource backing this stream
|
||||
const std::string& GetLocation() { return location_; };
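The hunk above shows only part of the ByteStream interface. As a rough, self-contained illustration of the contract it implies (Open/Close/Read/Seek/GetPosition/Eof plus GetLocation), here is a minimal in-memory sketch. The class name MemoryByteStream is made up, Status is reduced to a bool, and the exact base-class shape is an assumption, so this is not Impala's real code:

```cpp
#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <cstring>
#include <string>
#include <utility>

// Hypothetical stand-in for the ByteStream contract shown above; Status is a bool here.
class MemoryByteStream {
 public:
  explicit MemoryByteStream(std::string data) : data_(std::move(data)), pos_(0) {}

  bool Open(const std::string& location) { location_ = location; pos_ = 0; return true; }
  bool Close() { return true; }

  // Copies up to req_len bytes into buf and reports how many were actually read.
  bool Read(char* buf, int64_t req_len, int64_t* actual_len) {
    *actual_len = std::min<int64_t>(req_len, static_cast<int64_t>(data_.size()) - pos_);
    std::memcpy(buf, data_.data() + pos_, *actual_len);
    pos_ += *actual_len;
    return true;
  }

  bool Seek(int64_t offset) { pos_ = offset; return true; }
  bool GetPosition(int64_t* position) { *position = pos_; return true; }
  bool Eof(bool* eof) { *eof = pos_ >= static_cast<int64_t>(data_.size()); return true; }
  const std::string& GetLocation() { return location_; }

 private:
  std::string data_;
  std::string location_;
  int64_t pos_;
};

int main() {
  MemoryByteStream stream("hello byte stream");
  stream.Open("memory://demo");
  char buf[6] = {0};
  int64_t n = 0;
  stream.Read(buf, 5, &n);
  std::printf("read %d bytes: %s\n", static_cast<int>(n), buf);  // read 5 bytes: hello
  return 0;
}
```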
|
||||
|
||||
|
||||
be/src/exec/delimited-text-parser.cc (new file, 337 lines)
@@ -0,0 +1,337 @@
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
|
||||
#include "util/cpu-info.h"
|
||||
#include "exec/delimited-text-parser.h"
|
||||
|
||||
using namespace impala;
|
||||
using namespace std;
|
||||
|
||||
void DelimitedTextParser::ParserReset() {
|
||||
current_column_has_escape_ = false;
|
||||
last_char_is_escape_ = false;
|
||||
column_idx_ = start_column_;
|
||||
}
|
||||
|
||||
DelimitedTextParser::DelimitedTextParser(const vector<int>& map_column_to_slot,
|
||||
int start_column,
|
||||
RuntimeProfile::Counter* timer,
|
||||
char tuple_delim,
|
||||
char field_delim,
|
||||
char collection_item_delim,
|
||||
char escape_char)
|
||||
: map_column_to_slot_(map_column_to_slot),
|
||||
start_column_(start_column),
|
||||
parse_time_counter_(timer),
|
||||
field_delim_(field_delim),
|
||||
escape_char_(escape_char),
|
||||
collection_item_delim_(collection_item_delim),
|
||||
tuple_delim_(tuple_delim) {
|
||||
|
||||
// Initialize the sse search registers.
|
||||
// TODO: is this safe to do in here? Not sure if the compiler/system
|
||||
// will manage these registers for us.
|
||||
char tmp[SSEUtil::CHARS_PER_128_BIT_REGISTER];
|
||||
memset(tmp, 0, sizeof(tmp));
|
||||
if (tuple_delim_ != '\0') {
|
||||
tmp[0] = tuple_delim_;
|
||||
xmm_tuple_search_ = _mm_loadu_si128(reinterpret_cast<__m128i*>(tmp));
|
||||
}
|
||||
if (escape_char_ != '\0') {
|
||||
tmp[0] = escape_char_;
|
||||
xmm_escape_search_ = _mm_loadu_si128(reinterpret_cast<__m128i*>(tmp));
|
||||
}
|
||||
tmp[0] = field_delim_;
|
||||
tmp[1] = collection_item_delim_;
|
||||
xmm_field_search_ = _mm_loadu_si128(reinterpret_cast<__m128i*>(tmp));
|
||||
|
||||
column_idx_ = start_column_;
|
||||
current_column_has_escape_ = false;
|
||||
last_char_is_escape_ = false;
|
||||
}
|
||||
|
||||
|
||||
// Updates the values in the field and tuple masks, escaping them if necessary.
|
||||
// If the character at n is an escape character, then delimiters(tuple/field/escape
|
||||
// characters) at n+1 don't count.
|
||||
inline void ProcessEscapeMask(int escape_mask, bool* last_char_is_escape,
|
||||
int* field_mask, int* tuple_mask) {
|
||||
// Escape characters can escape escape characters.
|
||||
bool first_char_is_escape = *last_char_is_escape;
|
||||
bool escape_next = first_char_is_escape;
|
||||
for (int i = 0; i < SSEUtil::CHARS_PER_128_BIT_REGISTER; ++i) {
|
||||
if (escape_next) {
|
||||
escape_mask &= ~SSEUtil::SSE_BITMASK[i];
|
||||
}
|
||||
escape_next = escape_mask & SSEUtil::SSE_BITMASK[i];
|
||||
}
|
||||
|
||||
// Remember last character for the next iteration
|
||||
*last_char_is_escape = escape_mask &
|
||||
SSEUtil::SSE_BITMASK[SSEUtil::CHARS_PER_128_BIT_REGISTER - 1];
|
||||
|
||||
// Shift escape mask up one so they match at the same bit index as the tuple and
|
||||
// field mask (instead of being the character before) and set the correct first bit
|
||||
escape_mask = escape_mask << 1 | first_char_is_escape;
|
||||
|
||||
// If escape_mask[n] is true, then tuple/field_mask[n] is escaped
|
||||
*tuple_mask &= ~escape_mask;
|
||||
*field_mask &= ~escape_mask;
|
||||
}
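The mask fixup above can be illustrated without SSE at all. The following standalone sketch applies the same logic to plain 16-bit integer masks; the width, the bit-per-character ordering, and the example input are assumptions for illustration, not Impala code:

```cpp
#include <cstdio>

// Toy version of ProcessEscapeMask: bit i of each mask flags character i of a
// 16-character chunk. An escape at i suppresses any delimiter match at i + 1.
void FixupMasks(int escape_mask, bool* last_char_is_escape,
                int* field_mask, int* tuple_mask) {
  bool first_char_is_escape = *last_char_is_escape;
  bool escape_next = first_char_is_escape;
  for (int i = 0; i < 16; ++i) {
    if (escape_next) escape_mask &= ~(1 << i);  // an escaped escape does not escape
    escape_next = escape_mask & (1 << i);
  }
  *last_char_is_escape = (escape_mask & (1 << 15)) != 0;  // carry into the next chunk
  escape_mask = escape_mask << 1 | first_char_is_escape;  // align with the escaped char
  *tuple_mask &= ~escape_mask;
  *field_mask &= ~escape_mask;
}

int main() {
  // Chunk "a\,b,c...": escape at index 1, field delimiters at indexes 2 and 4.
  int escape_mask = 1 << 1;
  int field_mask = (1 << 2) | (1 << 4);
  int tuple_mask = 0;
  bool last_char_is_escape = false;
  FixupMasks(escape_mask, &last_char_is_escape, &field_mask, &tuple_mask);
  std::printf("field_mask = 0x%x\n", field_mask);  // 0x10: only the delimiter at index 4 remains
  return 0;
}
```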
|
||||
|
||||
inline void DelimitedTextParser::AddColumn(int len,
|
||||
char** next_column_start, int* num_fields,
|
||||
vector<DelimitedTextParser::FieldLocation>* field_locations) {
|
||||
if (ReturnCurrentColumn()) {
|
||||
DCHECK_LT(*num_fields, field_locations->size());
|
||||
// Found a column that needs to be parsed, write the start/len to 'parsed_data_'
|
||||
(*field_locations)[*num_fields].start = *next_column_start;
|
||||
(*field_locations)[*num_fields].len = len;
|
||||
if (current_column_has_escape_) (*field_locations)[*num_fields].len *= -1;
|
||||
++(*num_fields);
|
||||
}
|
||||
current_column_has_escape_ = false;
|
||||
*next_column_start += len + 1;
|
||||
++column_idx_;
|
||||
}
|
||||
|
||||
// SSE optimized raw text file parsing. SSE4_2 added an instruction (with 3 modes) for
|
||||
// text processing. The modes mimic strchr, strstr and strcmp. For text parsing, we can
|
||||
// leverage the strchr functionality.
|
||||
//
|
||||
// The instruction operates on two sse registers:
|
||||
// - the needle (what you are searching for)
|
||||
// - the haystack (where you are searching in)
|
||||
// Both registers can contain up to 16 characters. The result is a 16-bit mask with a bit
|
||||
// set for each character in the haystack that matched any character in the needle.
|
||||
// For example:
|
||||
// Needle = 'abcd000000000000' (we're searching for any a's, b's, c's or d's)
|
||||
// Haystack = 'asdfghjklhjbdwwc' (the raw string)
|
||||
// Result = '1010000000011001'
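For readers unfamiliar with these intrinsics, here is a minimal, self-contained sketch of the strchr-style search described above. It uses the standard _SIDD flag combination such a search implies (the assumption being that SSEUtil::STRCHR_MODE corresponds to something like it); the delimiters ',' and '\n' are arbitrary choices for the demo. Build with -msse4.2.

```cpp
#include <nmmintrin.h>  // SSE4.2 string intrinsics (_mm_cmpistrm)
#include <cstdio>

int main() {
  // Needle: the set of delimiter characters, zero padded (implicit length).
  char needle_buf[16] = {0};
  needle_buf[0] = ',';
  needle_buf[1] = '\n';
  __m128i needle = _mm_loadu_si128(reinterpret_cast<__m128i*>(needle_buf));

  // Haystack: exactly 16 bytes of raw text.
  char haystack_buf[17] = "a,b,c\nlonger,row";
  __m128i haystack = _mm_loadu_si128(reinterpret_cast<__m128i*>(haystack_buf));

  // "Equal any" comparison with a bit-mask result mimics strchr over 16 bytes.
  __m128i mask_reg = _mm_cmpistrm(
      needle, haystack, _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
  int mask = _mm_extract_epi16(mask_reg, 0);  // low 16 bits hold the per-byte mask

  for (int i = 0; i < 16; ++i) {
    if (mask & (1 << i)) std::printf("delimiter at offset %d\n", i);  // offsets 1, 3, 5, 12
  }
  return 0;
}
```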
|
||||
Status DelimitedTextParser::ParseFieldLocations(int max_tuples, int64_t remaining_len,
|
||||
char** byte_buffer_ptr, std::vector<FieldLocation>* field_locations,
|
||||
int* num_tuples, int* num_fields, char** next_column_start) {
|
||||
if (parse_time_counter_ != NULL)
|
||||
COUNTER_SCOPED_TIMER(parse_time_counter_);
|
||||
|
||||
// Start of this batch.
|
||||
*next_column_start = *byte_buffer_ptr;
|
||||
|
||||
// To parse using SSE, we:
|
||||
// 1. Load into different sse registers the different characters we need to search for
|
||||
// tuple breaks, field breaks, escape characters
|
||||
// 2. Load 16 characters at a time into the sse register
|
||||
// 3. Use the SSE instruction to do strchr on those 16 chars, the result is a bitmask
|
||||
// 4. Compute the bitmask for tuple breaks, field breaks and escape characters.
|
||||
// 5. If there are escape characters, fix up the matching masked bits in the
|
||||
// field/tuple mask
|
||||
// 6. Go through the mask bit by bit and write the parsed data.
|
||||
|
||||
// xmm registers:
|
||||
// - xmm_buffer: the register holding the current (16 chars) we're working on from the
|
||||
// file
|
||||
// - xmm_tuple_search_: the tuple search register. Only contains the tuple_delim char.
|
||||
// - xmm_field_search_: the field search register. Contains field delim and
|
||||
// collection_item delim_char
|
||||
// - xmm_escape_search_: the escape search register. Only contains escape char
|
||||
// - xmm_tuple_mask: the result of doing strchr for the tuple delim
|
||||
// - xmm_field_mask: the result of doing strchr for the field delim
|
||||
// - xmm_escape_mask: the result of doing strchr for the escape char
|
||||
__m128i xmm_buffer, xmm_tuple_mask, xmm_field_mask, xmm_escape_mask;
|
||||
|
||||
if (CpuInfo::Instance()->IsSupported(CpuInfo::SSE4_2)) {
|
||||
while (remaining_len >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
|
||||
// Load the next 16 bytes into the xmm register
|
||||
xmm_buffer = _mm_loadu_si128(reinterpret_cast<__m128i*>(*byte_buffer_ptr));
|
||||
|
||||
// Do the strchr for tuple and field breaks
|
||||
// TODO: can we parallelize this as well? Are there multiple sse execution units?
|
||||
// The strchr sse instruction returns the result in the lower bits of the sse
|
||||
// register. Since we only process 16 characters at a time, only the lower 16 bits
|
||||
// can contain non-zero values.
|
||||
// _mm_extract_epi16 will extract 16 bits out of the xmm register. The second
|
||||
int tuple_mask = 0;
|
||||
if (tuple_delim_ != '\0') {
|
||||
xmm_tuple_mask =
|
||||
_mm_cmpistrm(xmm_tuple_search_, xmm_buffer, SSEUtil::STRCHR_MODE);
|
||||
tuple_mask = _mm_extract_epi16(xmm_tuple_mask, 0);
|
||||
}
|
||||
int field_mask = 0;
|
||||
if (field_delim_ != '\0' || collection_item_delim_ != 0) {
|
||||
xmm_field_mask =
|
||||
_mm_cmpistrm(xmm_field_search_, xmm_buffer, SSEUtil::STRCHR_MODE);
|
||||
field_mask = _mm_extract_epi16(xmm_field_mask, 0);
|
||||
}
|
||||
|
||||
// parameter specifies which 16 bits to extract (0 for the lowest 16 bits).
|
||||
int escape_mask = 0;
|
||||
|
||||
// If the table does not use escape characters, skip processing for it.
|
||||
if (escape_char_ != '\0') {
|
||||
xmm_escape_mask = _mm_cmpistrm(xmm_escape_search_, xmm_buffer,
|
||||
SSEUtil::STRCHR_MODE);
|
||||
escape_mask = _mm_extract_epi16(xmm_escape_mask, 0);
|
||||
ProcessEscapeMask(escape_mask, &last_char_is_escape_, &field_mask, &tuple_mask);
|
||||
}
|
||||
|
||||
// Tuple delims are automatically field delims
|
||||
field_mask |= tuple_mask;
|
||||
|
||||
if (field_mask != 0) {
|
||||
// Loop through the mask and find the tuple/column offsets
|
||||
for (int n = 0; n < SSEUtil::CHARS_PER_128_BIT_REGISTER; ++n) {
|
||||
if (escape_mask != 0) {
|
||||
current_column_has_escape_ =
|
||||
current_column_has_escape_ || (escape_mask & SSEUtil::SSE_BITMASK[n]);
|
||||
}
|
||||
|
||||
if (field_mask & SSEUtil::SSE_BITMASK[n]) {
|
||||
AddColumn((*byte_buffer_ptr + n) - *next_column_start,
|
||||
next_column_start, num_fields, field_locations);
|
||||
}
|
||||
|
||||
if (tuple_mask & SSEUtil::SSE_BITMASK[n]) {
|
||||
column_idx_ = start_column_;
|
||||
++(*num_tuples);
|
||||
if (*num_tuples == max_tuples) {
|
||||
(*byte_buffer_ptr) += (n + 1);
|
||||
last_char_is_escape_ = false;
|
||||
return Status::OK;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
current_column_has_escape_ = (current_column_has_escape_ || escape_mask);
|
||||
}
|
||||
|
||||
remaining_len -= SSEUtil::CHARS_PER_128_BIT_REGISTER;
|
||||
*byte_buffer_ptr += SSEUtil::CHARS_PER_128_BIT_REGISTER;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the remaining characters
|
||||
while (remaining_len > 0) {
|
||||
bool new_tuple = false;
|
||||
bool new_col = false;
|
||||
|
||||
if (!last_char_is_escape_) {
|
||||
if (tuple_delim_ != '\0' && **byte_buffer_ptr == tuple_delim_) {
|
||||
new_tuple = true;
|
||||
new_col = true;
|
||||
} else if (**byte_buffer_ptr == field_delim_
|
||||
|| **byte_buffer_ptr == collection_item_delim_) {
|
||||
new_col = true;
|
||||
}
|
||||
}
|
||||
if (**byte_buffer_ptr == escape_char_) {
|
||||
current_column_has_escape_ = true;
|
||||
last_char_is_escape_ = !last_char_is_escape_;
|
||||
} else {
|
||||
last_char_is_escape_ = false;
|
||||
}
|
||||
|
||||
if (new_col) {
|
||||
AddColumn(*byte_buffer_ptr - *next_column_start,
|
||||
next_column_start, num_fields, field_locations);
|
||||
}
|
||||
|
||||
if (new_tuple) {
|
||||
column_idx_ = start_column_;
|
||||
++(*num_tuples);
|
||||
if (*num_tuples == max_tuples) {
|
||||
++*byte_buffer_ptr;
|
||||
return Status::OK;
|
||||
}
|
||||
}
|
||||
|
||||
--remaining_len;
|
||||
++*byte_buffer_ptr;
|
||||
|
||||
}
|
||||
|
||||
// For formats that store the length of the row, the row is not delimited:
|
||||
// e.g. sequence files.
|
||||
if (tuple_delim_ == '\0') {
|
||||
DCHECK(remaining_len == 0);
|
||||
AddColumn(*byte_buffer_ptr - *next_column_start,
|
||||
next_column_start, num_fields, field_locations);
|
||||
column_idx_ = start_column_;
|
||||
++(*num_tuples);
|
||||
}
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
// Find the start of the first full tuple in buffer by looking for the end of
|
||||
// the previous tuple.
|
||||
// TODO: most of this is not tested. We need some tailored data to exercise the boundary
|
||||
// cases
|
||||
int DelimitedTextParser::FindFirstTupleStart(char* buffer, int len) {
|
||||
int tuple_start = 0;
|
||||
char* buffer_start = buffer;
|
||||
restart:
|
||||
if (CpuInfo::Instance()->IsSupported(CpuInfo::SSE4_2)) {
|
||||
__m128i xmm_buffer, xmm_tuple_mask;
|
||||
while (tuple_start < len) {
|
||||
// TODO: can we parallelize this as well? Are there multiple sse execution units?
|
||||
// Load the next 16 bytes into the xmm register and do strchr for the
|
||||
// tuple delimiter.
|
||||
int chr_count = len - tuple_start;
|
||||
if (chr_count > SSEUtil::CHARS_PER_128_BIT_REGISTER) {
|
||||
chr_count = SSEUtil::CHARS_PER_128_BIT_REGISTER;
|
||||
}
|
||||
xmm_buffer = _mm_loadu_si128(reinterpret_cast<__m128i*>(buffer));
|
||||
xmm_tuple_mask =
|
||||
_mm_cmpestrm(xmm_tuple_search_, 1, xmm_buffer, chr_count, SSEUtil::STRCHR_MODE);
|
||||
int tuple_mask = _mm_extract_epi16(xmm_tuple_mask, 0);
|
||||
if (tuple_mask != 0) {
|
||||
for (int i = 0; i < SSEUtil::CHARS_PER_128_BIT_REGISTER; ++i) {
|
||||
if ((tuple_mask & SSEUtil::SSE_BITMASK[i]) != 0) {
|
||||
tuple_start += i + 1;
|
||||
buffer += i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
tuple_start += chr_count;
|
||||
buffer += chr_count;
|
||||
}
|
||||
} else {
|
||||
for (int i = tuple_start; i < len; ++i) {
|
||||
char c = *buffer++;
|
||||
if (c == tuple_delim_) {
|
||||
tuple_start = i + 1;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (escape_char_ != '\0') {
|
||||
// Scan backwards for escape characters. We do this after
|
||||
// finding the tuple break rather than during the (above)
|
||||
// forward scan to make the forward scan faster. This will
|
||||
// perform worse if there are many characters right before the
|
||||
// tuple break that are all escape characters, but that is
|
||||
// unlikely.
|
||||
int num_escape_chars = 0;
|
||||
int before_tuple_end = tuple_start - 2;
|
||||
for (; before_tuple_end >= 0; --before_tuple_end) {
|
||||
if (buffer_start[before_tuple_end] == escape_char_) {
|
||||
++num_escape_chars;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// TODO: This sucks. All the preceding characters before the tuple delim were
|
||||
// escape characters. We need to read from the previous block to see what to do.
|
||||
DCHECK_GT(before_tuple_end, 0);
|
||||
|
||||
// An even number of escape characters means they cancel out and this tuple break
|
||||
// is *not* escaped.
|
||||
if (num_escape_chars % 2 != 0) {
|
||||
goto restart;
|
||||
}
|
||||
}
|
||||
|
||||
return tuple_start;
|
||||
}
|
||||
be/src/exec/delimited-text-parser.h (new file, 142 lines)
@@ -0,0 +1,142 @@
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
|
||||
#ifndef IMPALA_EXEC_DELIMITED_TEXT_PARSER_H
|
||||
#define IMPALA_EXEC_DELIMITED_TEXT_PARSER_H
|
||||
|
||||
#include "exec/hdfs-scanner.h"
|
||||
#include "exec/hdfs-scan-node.h"
|
||||
|
||||
namespace impala {
|
||||
|
||||
class DelimitedTextParser {
|
||||
public:
|
||||
// Intermediate structure used for the two-pass parsing approach. In the first pass,
|
||||
// the FieldLocation structs are filled out and contain where all the fields start and
|
||||
// their lengths. In the second pass, the FieldLocation is used to write out the
|
||||
// slots. We want to keep this struct as small as possible.
|
||||
struct FieldLocation {
|
||||
// Start of the field.
|
||||
char* start;
|
||||
// Encodes the length and whether or not this field needs to be unescaped.
|
||||
// If len < 0, then the field needs to be unescaped.
|
||||
int len;
|
||||
};
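To make the negative-length convention concrete, here is a tiny standalone sketch of how a second pass would decode it. The struct mirrors the one above, but everything else (the data and the loop) is made up for illustration:

```cpp
#include <cstdio>
#include <cstdlib>
#include <vector>

// Mirrors the idea above: a negative len flags a field that still contains
// escape characters and must be unescaped when it is materialized.
struct FieldLocation {
  const char* start;
  int len;  // < 0 means "needs unescaping"
};

int main() {
  const char* data = "abc";
  std::vector<FieldLocation> fields = {{data, 3}, {data, -3}};
  for (const FieldLocation& f : fields) {
    bool need_escape = f.len < 0;
    int len = std::abs(f.len);
    std::printf("len=%d need_escape=%d\n", len, static_cast<int>(need_escape));
  }
  return 0;
}
```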
|
||||
|
||||
// The Delimited Text Parser parses text rows that are delimited by specific
|
||||
// characters:
|
||||
// tuple_delim: delimits tuples
|
||||
// field_delim: delimits fields
|
||||
// collection_item_delim: delimits collection items
|
||||
// escape_char: escape delimiters, make them part of the data.
|
||||
// Other parameters to the creator:
|
||||
// map_column_to_slot: maps a column in the input to the output slot.
|
||||
// start_column: the index in the above vector where the columns start.
|
||||
// it will be non-zero if there are partition columns.
|
||||
// timer: timer to use to time the parsing operation, or NULL.
|
||||
//
|
||||
// The main method is ParseFieldLocations, which fills in a vector of
|
||||
// pointers and lengths to the fields. It can also handle an escape character
|
||||
// which masks a tuple or field delimiter that occurs in the data.
|
||||
// FindFirstTupleStart returns the position after the first non-escaped tuple
|
||||
// delimiter from the starting offset.
|
||||
DelimitedTextParser(const std::vector<int>& map_column_to_slot, int start_column,
|
||||
RuntimeProfile::Counter* timer,
|
||||
char tuple_delim, char field_delim_ = '\0',
|
||||
char collection_item_delim = '\0', char escape_char = '\0');
|
||||
|
||||
// Called to initialize parser at beginning of scan range.
|
||||
void ParserReset();
|
||||
|
||||
// Check if we are at the start of a tuple.
|
||||
bool AtTupleStart() { return column_idx_ == start_column_; }
|
||||
|
||||
// Parses a byte buffer for the field and tuple breaks.
|
||||
// This function will write the field start & len to field_locations
|
||||
// which can then be written out to tuples.
|
||||
// This function uses SSE (the Intel x86 "Streaming SIMD Extensions"
// instruction set extension) if the hardware supports SSE4.2
|
||||
// instructions. SSE4.2 added string processing instructions that
|
||||
// allow for processing 16 characters at a time. Otherwise, this
|
||||
// function walks the file_buffer_ character by character.
|
||||
// Input Parameters:
|
||||
// max_tuples: The maximum number of tuples that should be parsed.
|
||||
// This is used to control how the batching works.
|
||||
// remaining_len: Length of data remaining in byte_buffer_ptr.
|
||||
// byte_buffer_ptr: Pointer to the buffer containing the data to be parsed.
|
||||
// Output Parameters:
|
||||
// field_locations: Vector of pointers to data fields and their lengths
|
||||
// num_tuples: Number of tuples parsed
|
||||
// num_fields: Number of materialized fields parsed
|
||||
// next_column_start: pointer within file_buffer_ where the next field starts
|
||||
// after the return from the call to ParseData
|
||||
Status ParseFieldLocations(int max_tuples, int64_t remaining_len,
|
||||
char** byte_buffer_ptr, std::vector<FieldLocation>* field_locations,
|
||||
int* num_tuples, int* num_fields, char** next_column_start);
|
||||
|
||||
// Find the start of a tuple if jumping into the middle of a file.
|
||||
// Returns the offset in the buffer of the tuple.
|
||||
int FindFirstTupleStart(char* buffer, int len);
|
||||
|
||||
// Will we return the current column to the query?
|
||||
bool ReturnCurrentColumn() {
|
||||
return map_column_to_slot_[column_idx_] != HdfsScanNode::SKIP_COLUMN;
|
||||
}
|
||||
|
||||
private:
|
||||
// Initialize the parser state.
|
||||
void ParserInit(HdfsScanNode* scan_node);
|
||||
|
||||
// Helper routine to add a column to the field_locations vector.
|
||||
// Input:
|
||||
// len: length of the current column.
|
||||
// Input/Output:
|
||||
// next_column_start: Start of the current column, moved to the start of the next.
|
||||
// num_fields: current number of fields processed, updated to the next field.
|
||||
// Output:
|
||||
// field_locations: updated with start and length of current field.
|
||||
void AddColumn(int len, char** next_column_start, int* num_fields,
|
||||
std::vector<FieldLocation>* field_locations);
|
||||
|
||||
// Map columns in the data to slots in the tuples.
|
||||
const std::vector<int>& map_column_to_slot_;
|
||||
|
||||
// First non-partition column that will be extracted from parsed data.
|
||||
int start_column_;
|
||||
|
||||
// Pointer to scan node parse time counter.
|
||||
RuntimeProfile::Counter* parse_time_counter_;
|
||||
|
||||
// SSE(xmm) register containing the tuple search character.
|
||||
__m128i xmm_tuple_search_;
|
||||
|
||||
// SSE(xmm) register containing the field search character.
|
||||
__m128i xmm_field_search_;
|
||||
|
||||
// SSE(xmm) register containing the escape search character.
|
||||
__m128i xmm_escape_search_;
|
||||
|
||||
// Character delimiting fields (to become slots).
|
||||
char field_delim_;
|
||||
|
||||
// Escape character.
|
||||
char escape_char_;
|
||||
|
||||
// Character delimiting collection items (to become slots).
|
||||
char collection_item_delim_;
|
||||
|
||||
// Character delimiting tuples.
|
||||
char tuple_delim_;
|
||||
|
||||
// Whether or not the current column has an escape character in it
|
||||
// (and needs to be unescaped)
|
||||
bool current_column_has_escape_;
|
||||
|
||||
// Whether or not the previous character was the escape character
|
||||
bool last_char_is_escape_;
|
||||
|
||||
// Index to keep track of the current column in the current file
|
||||
int column_idx_;
|
||||
};
|
||||
|
||||
} // namespace impala
|
||||
#endif // IMPALA_EXEC_DELIMITED_TEXT_PARSER_H
|
||||
@@ -114,6 +114,7 @@ Status ExecNode::CreateNode(ObjectPool* pool, const TPlanNode& tnode,
|
||||
switch (tnode.node_type) {
|
||||
case TPlanNodeType::HDFS_TEXT_SCAN_NODE:
|
||||
case TPlanNodeType::HDFS_RCFILE_SCAN_NODE:
|
||||
case TPlanNodeType::HDFS_SEQFILE_SCAN_NODE:
|
||||
*node = pool->Add(new HdfsScanNode(pool, tnode, descs));
|
||||
return Status::OK;
|
||||
case TPlanNodeType::HBASE_SCAN_NODE:
|
||||
|
||||
@@ -8,6 +8,7 @@
|
||||
#include "util/jni-util.h"
|
||||
#include "util/runtime-profile.h"
|
||||
#include "gen-cpp/PlanNodes_types.h"
|
||||
#include "exec/text-converter.inline.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace boost;
|
||||
@@ -90,12 +91,12 @@ void HBaseScanNode::WriteTextSlot(
|
||||
void* value, int value_length, SlotDescriptor* slot,
|
||||
RuntimeState* state, bool* error_in_row) {
|
||||
COUNTER_SCOPED_TIMER(tuple_write_timer());
|
||||
bool parsed_ok = text_converter_->ConvertAndWriteSlotBytes(reinterpret_cast<char*>(value),
|
||||
reinterpret_cast<char*>(value) + value_length, tuple_, slot, true, false);
|
||||
if (!parsed_ok) {
|
||||
if (!text_converter_->WriteSlot(state,
|
||||
slot, tuple_, reinterpret_cast<char*>(value), value_length, true, false).ok()) {
|
||||
*error_in_row = true;
|
||||
if (state->LogHasSpace()) {
|
||||
state->error_stream() << "Error converting column " << family << ":" << qualifier << ": "
|
||||
state->error_stream() << "Error converting column " << family
|
||||
<< ":" << qualifier << ": "
|
||||
<< "'" << reinterpret_cast<char*>(value) << "' TO "
|
||||
<< TypeToString(slot->type()) << endl;
|
||||
}
|
||||
|
||||
@@ -79,3 +79,14 @@ Status HdfsByteStream::Seek(int64_t offset) {
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status HdfsByteStream::Eof(bool* eof) {
|
||||
hdfsFileInfo* hdfsInfo = hdfsGetPathInfo(hdfs_connection_, &location_[0]);
|
||||
if (hdfsInfo == NULL) {
|
||||
return Status("Error getting Info for HDFS file: " + location_);
|
||||
}
|
||||
*eof = hdfsTell(hdfs_connection_, hdfs_file_) >= hdfsInfo->mSize;
|
||||
|
||||
hdfsFreeFileInfo(hdfsInfo, 1);
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
@@ -22,6 +22,7 @@ class HdfsByteStream : public ByteStream {
|
||||
virtual Status Read(char *buf, int64_t req_length, int64_t* actual_length);
|
||||
virtual Status Seek(int64_t offset);
|
||||
virtual Status GetPosition(int64_t* position);
|
||||
virtual Status Eof(bool* eof);
|
||||
|
||||
private:
|
||||
hdfsFS hdfs_connection_;
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
#include "gen-cpp/PlanNodes_types.h"
|
||||
#include "exec/hdfs-rcfile-scanner.h"
|
||||
#include "exec/hdfs-scan-node.h"
|
||||
#include "exec/text-converter.inline.h"
|
||||
|
||||
using namespace std;
|
||||
using namespace boost;
|
||||
@@ -130,30 +131,30 @@ Status HdfsRCFileScanner::GetNext(
|
||||
|
||||
const char* col_start = row_group_->GetFieldPtr(rc_column_idx);
|
||||
int field_len = row_group_->GetFieldLength(rc_column_idx);
|
||||
bool parse_ok = true;
|
||||
Status parse_status;
|
||||
|
||||
switch (slot_desc->type()) {
|
||||
case TYPE_STRING:
|
||||
// TODO: Eliminate the unnecessary copy operation from the RCFileRowGroup
|
||||
// buffers to the tuple buffers by pushing the tuple buffer down into the
|
||||
// RowGroup class.
|
||||
parse_ok = text_converter_->ConvertAndWriteSlotBytes(col_start,
|
||||
col_start + field_len, tuple_, slot_desc, true, false);
|
||||
parse_status = text_converter_->WriteSlot(state, slot_desc, tuple_,
|
||||
col_start, field_len, true, false);
|
||||
break;
|
||||
default:
|
||||
// RCFile stores all fields as strings regardless of type, but these
|
||||
// strings are not NULL terminated. The strto* functions that TextConverter
|
||||
// uses require NULL terminated strings, so we have to manually NULL terminate
|
||||
// the strings before passing them to ConvertAndWriteSlotBytes.
|
||||
// the strings before passing them to WriteSlot
|
||||
// TODO: Devise a way to avoid this unnecessary copy-and-terminate operation.
|
||||
string terminated_field(col_start, field_len);
|
||||
const char* c_str = terminated_field.c_str();
|
||||
parse_ok = text_converter_->ConvertAndWriteSlotBytes(c_str,
|
||||
c_str + field_len, tuple_, slot_desc, false, false);
|
||||
parse_status = text_converter_->WriteSlot(state, slot_desc, tuple_,
|
||||
c_str, field_len, false, false);
|
||||
break;
|
||||
}
|
||||
|
||||
if (!parse_ok) {
|
||||
if (!parse_status.ok()) {
|
||||
error_in_row = true;
|
||||
if (state->LogHasSpace()) {
|
||||
state->error_stream() << "Error converting column: " << rc_column_idx <<
|
||||
|
||||
@@ -2,6 +2,7 @@
|
||||
|
||||
#include "exec/hdfs-scan-node.h"
|
||||
#include "exec/hdfs-text-scanner.h"
|
||||
#include "exec/hdfs-sequence-scanner.h"
|
||||
#include "exec/hdfs-rcfile-scanner.h"
|
||||
#include "exec/hdfs-byte-stream.h"
|
||||
|
||||
@@ -63,7 +64,7 @@ Status HdfsScanNode::InitRegex(ObjectPool* pool, const TPlanNode& tnode) {
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status HdfsScanNode::GetNext(RuntimeState* state, RowBatch* row_batch, bool *eos) {
|
||||
Status HdfsScanNode::GetNext(RuntimeState* state, RowBatch* row_batch, bool* eos) {
|
||||
COUNTER_SCOPED_TIMER(runtime_profile_->total_time_counter());
|
||||
|
||||
// Guard against trying to read an empty set of scan ranges
|
||||
@@ -134,6 +135,9 @@ Status HdfsScanNode::InitCurrentScanRange(RuntimeState* state) {
|
||||
current_scanner_.reset(new HdfsTextScanner(this, tuple_desc_, template_tuple_,
|
||||
tuple_pool_.get()));
|
||||
break;
|
||||
case TPlanNodeType::HDFS_SEQFILE_SCAN_NODE:
|
||||
current_scanner_.reset(new HdfsSequenceScanner(this, tuple_desc_, template_tuple_,
|
||||
tuple_pool_.get())); break;
|
||||
case TPlanNodeType::HDFS_RCFILE_SCAN_NODE:
|
||||
current_scanner_.reset(new HdfsRCFileScanner(this, tuple_desc_, template_tuple_,
|
||||
tuple_pool_.get()));
|
||||
|
||||
be/src/exec/hdfs-sequence-scanner.cc (new file, 791 lines)
@@ -0,0 +1,791 @@
|
||||
// Copyright (c) 2011 Cloudera, Inc. All rights reserved.
|
||||
#include "runtime/runtime-state.h"
|
||||
#include "exec/hdfs-sequence-scanner.h"
|
||||
#include "runtime/tuple.h"
|
||||
#include "runtime/row-batch.h"
|
||||
#include "exec/text-converter.h"
|
||||
#include "util/cpu-info.h"
|
||||
#include "exec/hdfs-scan-node.h"
|
||||
#include "exec/delimited-text-parser.h"
|
||||
#include "exec/serde-utils.h"
|
||||
#include "exec/buffered-byte-stream.h"
|
||||
#include "exec/text-converter.inline.h"
|
||||
|
||||
// Compression libraries
|
||||
#include <zlib.h>
|
||||
#include <bzlib.h>
|
||||
#include <snappy.h>
|
||||
|
||||
using namespace std;
|
||||
using namespace boost;
|
||||
using namespace impala;
|
||||
|
||||
const char* const HdfsSequenceScanner::SEQFILE_KEY_CLASS_NAME =
|
||||
"org.apache.hadoop.io.BytesWritable";
|
||||
|
||||
const char* const HdfsSequenceScanner::SEQFILE_VALUE_CLASS_NAME =
|
||||
"org.apache.hadoop.io.Text";
|
||||
|
||||
const char* const HdfsSequenceScanner::SEQFILE_DEFAULT_COMPRESSION =
|
||||
"org.apache.hadoop.io.compress.DefaultCodec";
|
||||
|
||||
const char* const HdfsSequenceScanner::SEQFILE_GZIP_COMPRESSION =
|
||||
"org.apache.hadoop.io.compress.GzipCodec";
|
||||
|
||||
const char* const HdfsSequenceScanner::SEQFILE_BZIP2_COMPRESSION =
|
||||
"org.apache.hadoop.io.compress.BZip2Codec";
|
||||
|
||||
const char* const HdfsSequenceScanner::SEQFILE_SNAPPY_COMPRESSION =
|
||||
"org.apache.hadoop.io.compress.SnappyCodec";
|
||||
|
||||
const uint8_t HdfsSequenceScanner::SEQFILE_VERSION_HEADER[4] = {'S', 'E', 'Q', 6};
|
||||
|
||||
const int HdfsSequenceScanner::SEQFILE_KEY_LENGTH = 4;
|
||||
|
||||
// These are magic numbers from zlib.h. Not clear why they are not defined there.
|
||||
// 15 == window size, 32 == figure out if zlib or gzip.
|
||||
#define WINDOW_BITS 15
|
||||
#define DETECT_CODEC 32
|
||||
|
||||
// Decompress a block encoded by gzip or zlib.
|
||||
// Inputs:
|
||||
// input_length: length of input buffer.
|
||||
// in: input buffer, contains compressed data
|
||||
// output_length: length of output buffer.
|
||||
// In/Out:
|
||||
// out: output buffer, place to put decompressed data.
|
||||
// Output:
|
||||
// too_small: set to true if the output_length is too small.
|
||||
static Status DecompressGzipBlock(int input_length, char* in,
|
||||
int output_length, char* out, bool* too_small) {
|
||||
z_stream stream;
|
||||
bzero(&stream, sizeof(stream));
|
||||
stream.next_in = reinterpret_cast<Bytef*>(in);
|
||||
stream.avail_in = input_length;
|
||||
stream.next_out = reinterpret_cast<Bytef*>(out);
|
||||
stream.avail_out = output_length;
|
||||
|
||||
*too_small = false;
|
||||
int ret;
|
||||
// Initialize and run either zlib or gzip inflate.
|
||||
if ((ret = inflateInit2(&stream, WINDOW_BITS | DETECT_CODEC)) != Z_OK) {
|
||||
stringstream ss;
|
||||
ss << "zlib inflateInit failed: " << stream.msg;
|
||||
return Status(ss.str());
|
||||
}
|
||||
if ((ret = inflate(&stream, 1)) != Z_STREAM_END) {
|
||||
(void)inflateEnd(&stream);
|
||||
if (ret == Z_OK) {
|
||||
*too_small = true;
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
stringstream ss;
|
||||
ss << "zlib inflate failed: " << stream.msg;
|
||||
return Status(ss.str());
|
||||
}
|
||||
if (inflateEnd(&stream) != Z_OK) {
|
||||
stringstream ss;
|
||||
ss << "zlib inflateEnd failed: " << stream.msg;
|
||||
return Status(ss.str());
|
||||
}
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
// Decompress a block encoded by bzip2.
|
||||
// Inputs:
|
||||
// input_length: length of input buffer.
|
||||
// in: input buffer, contains compressed data
|
||||
// output_length: length of output buffer.
|
||||
// In/Out:
|
||||
// out: output buffer, place to put decompressed data.
|
||||
// Output:
|
||||
// too_small: set to true if the output_length is too small.
|
||||
static Status DecompressBzip2Block(int input_length, char* in,
|
||||
int output_length, char* out, bool* too_small) {
|
||||
bz_stream stream;
|
||||
bzero(&stream, sizeof(stream));
|
||||
stream.next_in = in;
|
||||
stream.avail_in = input_length;
|
||||
stream.next_out = out;
|
||||
stream.avail_out = output_length;
|
||||
|
||||
*too_small = false;
|
||||
int ret;
|
||||
if ((ret = BZ2_bzDecompressInit(&stream, 0, 0)) != BZ_OK) {
|
||||
stringstream ss;
|
||||
ss << "bzlib BZ2_bzDecompressInit failed: " << ret;
|
||||
return Status(ss.str());
|
||||
}
|
||||
if ((ret = BZ2_bzDecompress(&stream)) != BZ_STREAM_END) {
|
||||
(void)BZ2_bzDecompressEnd(&stream);
|
||||
if (ret == BZ_OK) {
|
||||
*too_small = true;
|
||||
return Status::OK;
|
||||
}
|
||||
stringstream ss;
|
||||
ss << "bzlib BZ2_bzDecompress failed: " << ret;
|
||||
return Status(ss.str());
|
||||
}
|
||||
if ((ret = BZ2_bzDecompressEnd(&stream)) != BZ_OK) {
|
||||
stringstream ss;
|
||||
ss << "bzlib BZ2_bzDecompressEnd failed: " << ret;
|
||||
return Status(ss.str());
|
||||
}
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
// Decompress a block encoded by Snappy.
|
||||
// Inputs:
|
||||
// input_length: length of input buffer.
|
||||
// in: input buffer, contains compressed data
|
||||
// output_length: length of output buffer.
|
||||
// In/Out:
|
||||
// out: output buffer, place to put decompressed data.
|
||||
// Output:
|
||||
// too_small: set to true if the output_length is too small.
|
||||
static Status DecompressSnappyBlock(int input_length, char* in,
|
||||
int output_length, char* out, bool* too_small) {
|
||||
*too_small = false;
|
||||
|
||||
// Hadoop uses a block compression scheme on top of snappy. First there is
|
||||
// an integer which is the size of the decompressed data followed by a
|
||||
// sequence of compressed blocks each preceded with an integer size.
|
||||
int32_t len;
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadInt(in, &len));
|
||||
|
||||
// TODO: Snappy knows how big the output is, we should just use that.
|
||||
if (output_length < len) {
|
||||
*too_small = true;
|
||||
return Status::OK;
|
||||
}
|
||||
in += sizeof(len);
|
||||
input_length -= sizeof(len);
|
||||
|
||||
do {
|
||||
// Read the length of the next block.
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadInt(in, &len));
|
||||
|
||||
if (len == 0) break;
|
||||
|
||||
in += sizeof(len);
|
||||
input_length -= sizeof(len);
|
||||
|
||||
// Read how big the output will be.
|
||||
size_t uncompressed_len;
|
||||
if (!snappy::GetUncompressedLength(static_cast<const char*>(in),
|
||||
input_length, &uncompressed_len)) {
|
||||
return Status("Snappy: GetUncompressedLength failed");
|
||||
}
|
||||
|
||||
DCHECK_GT(output_length, 0);
|
||||
if (!snappy::RawUncompress(static_cast<const char*>(in),
|
||||
static_cast<size_t>(len), out)) {
|
||||
return Status("Snappy: RawUncompress failed");
|
||||
}
|
||||
in += len;
|
||||
input_length -= len;
|
||||
out += uncompressed_len;
|
||||
output_length -= uncompressed_len;
|
||||
} while (input_length > 0);
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
HdfsSequenceScanner::HdfsSequenceScanner(HdfsScanNode* scan_node,
|
||||
const TupleDescriptor* tuple_desc,
|
||||
Tuple* template_tuple, MemPool* tuple_pool)
|
||||
: HdfsScanner(scan_node, tuple_desc, template_tuple, tuple_pool),
|
||||
delimited_text_parser_(NULL),
|
||||
text_converter_(NULL),
|
||||
unparsed_data_buffer_pool_(new MemPool()),
|
||||
unparsed_data_buffer_(NULL),
|
||||
unparsed_data_buffer_size_(0),
|
||||
num_buffered_records_in_compressed_block_(0) {
|
||||
const HdfsTableDescriptor* hdfs_table =
|
||||
static_cast<const HdfsTableDescriptor*>(tuple_desc->table_desc());
|
||||
|
||||
text_converter_.reset(new TextConverter(hdfs_table->escape_char(), tuple_pool_));
|
||||
|
||||
delimited_text_parser_.reset(new DelimitedTextParser(scan_node->column_to_slot_index(),
|
||||
scan_node->GetNumPartitionKeys(), NULL, '\0',
|
||||
hdfs_table->field_delim(), hdfs_table->collection_delim(),
|
||||
hdfs_table->escape_char()));
|
||||
// use the parser to find bytes that are -1
|
||||
find_first_parser_.reset(new DelimitedTextParser(scan_node->column_to_slot_index(),
|
||||
scan_node->GetNumPartitionKeys(), scan_node->parse_time_counter(),
|
||||
static_cast<char>(0xff)));
|
||||
}
|
||||
|
||||
Status HdfsSequenceScanner::InitCurrentScanRange(RuntimeState* state,
|
||||
HdfsScanRange* scan_range,
|
||||
ByteStream* byte_stream) {
|
||||
HdfsScanner::InitCurrentScanRange(state, scan_range, byte_stream);
|
||||
end_of_scan_range_ = scan_range->length + scan_range->offset;
|
||||
unbuffered_byte_stream_ = byte_stream;
|
||||
|
||||
// If the file is block compressed then we don't want to double buffer
|
||||
// the compressed blocks. In that case we read meta information in
|
||||
// filesystem block sizes (4KB); otherwise we read large chunks (1MB)
|
||||
// and pick meta data and data from that buffer.
|
||||
buffered_byte_stream_.reset(new BufferedByteStream(
|
||||
unbuffered_byte_stream_,
|
||||
is_blk_compressed_ ? FILE_BLOCK_SIZE : state->file_buffer_size(),
|
||||
scan_node_->scanner_timer()));
|
||||
|
||||
// Check the Location (file name) to see if we have changed files.
|
||||
// If this a new file then we need to read and process the header.
|
||||
if (previous_location_ != unbuffered_byte_stream_->GetLocation()) {
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->Seek(0));
|
||||
RETURN_IF_ERROR(ReadFileHeader());
|
||||
if (is_blk_compressed_) {
|
||||
unparsed_data_buffer_size_ = state->file_buffer_size();
|
||||
}
|
||||
previous_location_ = unbuffered_byte_stream_->GetLocation();
|
||||
}
|
||||
|
||||
delimited_text_parser_->ParserReset();
|
||||
|
||||
// Offset may not point to record boundary
|
||||
if (scan_range->offset != 0) {
|
||||
RETURN_IF_ERROR(unbuffered_byte_stream_->Seek(scan_range->offset));
|
||||
RETURN_IF_ERROR(FindFirstRecord(state));
|
||||
}
|
||||
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
// The start of the sync block is specified by an integer of -1. We search
|
||||
// bytes till we find a -1 and then look for 3 more -1 bytes which will make up
|
||||
// the integer. This is followed by the 16 byte sync block which was specified in
|
||||
// the file header.
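Here is a standalone sketch of the search this comment describes: scan for four 0xff bytes (the -1 record length) immediately followed by the 16-byte sync marker taken from the file header. The buffer contents and the marker value are made up for the demo, and the real scanner does this incrementally across reads rather than over one in-memory buffer:

```cpp
#include <cstdio>
#include <cstring>

// Returns the offset of the first sync block in buf, or -1 if none is found.
// 'sync' is the 16-byte sync marker read from the sequence file header.
int FindSync(const char* buf, int len, const char* sync) {
  for (int i = 0; i + 4 + 16 <= len; ++i) {
    // Four 0xff bytes encode the -1 that introduces a sync block.
    if (static_cast<unsigned char>(buf[i]) == 0xff &&
        static_cast<unsigned char>(buf[i + 1]) == 0xff &&
        static_cast<unsigned char>(buf[i + 2]) == 0xff &&
        static_cast<unsigned char>(buf[i + 3]) == 0xff &&
        std::memcmp(buf + i + 4, sync, 16) == 0) {
      return i;
    }
  }
  return -1;
}

int main() {
  char sync[16];
  std::memset(sync, 0xAB, sizeof(sync));      // stand-in for the header's sync hash
  char buf[64] = "some record bytes";         // rest of the buffer is zero filled
  std::memset(buf + 20, 0xff, 4);             // the -1 marker...
  std::memcpy(buf + 24, sync, sizeof(sync));  // ...followed by the sync hash
  std::printf("sync found at offset %d\n", FindSync(buf, sizeof(buf), sync));  // 20
  return 0;
}
```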
|
||||
Status HdfsSequenceScanner::FindFirstRecord(RuntimeState* state) {
|
||||
// A sync block is preceded by 4 bytes of -1 (0xff).
|
||||
int sync_flag_counter = 0;
|
||||
// Starting offset of the buffer we are scanning
|
||||
int64_t buf_start = 0;
|
||||
// Number of bytes read from stream
|
||||
int64_t num_bytes_read = 0;
|
||||
// Current offset into buffer.
|
||||
int64_t off = 0;
|
||||
// Bytes left to process in buffer.
|
||||
int64_t bytes_left = 0;
|
||||
// Size of buffer to read.
|
||||
int64_t read_size = FILE_BLOCK_SIZE;
|
||||
// Buffer to scan.
|
||||
char buf[read_size];
|
||||
|
||||
// Loop until we find a Sync block or get to the end of the range.
|
||||
while (buf_start + off < end_of_scan_range_ || sync_flag_counter != 0) {
|
||||
// If there are no bytes left to process in the buffer get some more.
|
||||
// We may make bytes_left < 0 while looping for 0xff bytes below.
|
||||
if (bytes_left <= 0) {
|
||||
if (buf_start == 0) {
|
||||
RETURN_IF_ERROR(unbuffered_byte_stream_->GetPosition(&buf_start));
|
||||
} else {
|
||||
// Seek to the next buffer, in case we read the byte stream below.
|
||||
buf_start += num_bytes_read;
|
||||
#ifndef NDEBUG
|
||||
int64_t position;
|
||||
RETURN_IF_ERROR(unbuffered_byte_stream_->GetPosition(&position));
|
||||
DCHECK_EQ(buf_start, position);
|
||||
#endif
|
||||
}
|
||||
// Do not read past the end of range, unless we stopped at a -1 byte.
|
||||
// This could be the start of a sync block and we must process the
|
||||
// following data.
|
||||
if (buf_start + read_size >= end_of_scan_range_) {
|
||||
read_size = (end_of_scan_range_ - buf_start);
|
||||
if (sync_flag_counter != 0 && read_size < 4 - sync_flag_counter) {
|
||||
read_size = 4 - sync_flag_counter;
|
||||
}
|
||||
}
|
||||
if (read_size == 0) {
|
||||
return Status::OK;
|
||||
}
|
||||
RETURN_IF_ERROR(unbuffered_byte_stream_->Read(buf, read_size, &num_bytes_read));
|
||||
off = 0;
|
||||
if (num_bytes_read == 0) {
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->SeekToParent());
|
||||
return Status::OK;
|
||||
}
|
||||
bytes_left = num_bytes_read;
|
||||
}
|
||||
|
||||
if (sync_flag_counter == 0) {
|
||||
off += find_first_parser_->FindFirstTupleStart(buf + off, bytes_left);
|
||||
bytes_left = num_bytes_read - off;
|
||||
|
||||
if (bytes_left == 0) continue;
|
||||
|
||||
sync_flag_counter = 1;
|
||||
}
|
||||
|
||||
// We found a -1 see if there are 3 more
|
||||
while (bytes_left != 0) {
|
||||
--bytes_left;
|
||||
if (buf[off++] != static_cast<char>(0xff)) {
|
||||
sync_flag_counter = 0;
|
||||
break;
|
||||
}
|
||||
|
||||
if (++sync_flag_counter == 4) {
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->Seek(buf_start + off));
|
||||
bool verified;
|
||||
RETURN_IF_ERROR(CheckSync(false, &verified));
|
||||
if (verified) {
|
||||
// Seek back to the beginning of the sync so the protocol readers are right.
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->Seek(buf_start + off - 4));
|
||||
return Status::OK;
|
||||
}
|
||||
sync_flag_counter = 0;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->SeekToParent());
|
||||
return Status::OK;
|
||||
|
||||
}
|
||||
|
||||
Status HdfsSequenceScanner::Prepare(RuntimeState* state, ByteStream* byte_stream) {
|
||||
RETURN_IF_ERROR(HdfsScanner::Prepare(state, byte_stream));
|
||||
|
||||
// Allocate the scratch space for two pass parsing. The most fields we can go
|
||||
// through in one parse pass is the batch size (tuples) * the number of fields per tuple
|
||||
// TODO: This should probably be based on L2/L3 cache sizes (as should the batch size)
|
||||
field_locations_.resize(state->batch_size() * scan_node_->materialized_slots().size());
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
inline Status HdfsSequenceScanner::GetRecordFromCompressedBlock(RuntimeState *state,
|
||||
char** record_ptr,
|
||||
int64_t* record_len,
|
||||
bool* eosr) {
|
||||
if (num_buffered_records_in_compressed_block_ == 0) {
|
||||
int64_t position;
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->GetPosition(&position));
|
||||
if (position >= end_of_scan_range_) {
|
||||
*eosr = true;
|
||||
return Status::OK;
|
||||
}
|
||||
RETURN_IF_ERROR(ReadCompressedBlock(state));
|
||||
}
|
||||
// Adjust next_record_ to move past the size of the length indicator.
|
||||
int size = SerDeUtils::ReadVLong(next_record_in_compressed_block_, record_len);
|
||||
next_record_in_compressed_block_ += size;
|
||||
*record_ptr = next_record_in_compressed_block_;
|
||||
// Point to the length of the next record.
|
||||
next_record_in_compressed_block_ += *record_len;
|
||||
--num_buffered_records_in_compressed_block_;
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
inline Status HdfsSequenceScanner::GetRecord(char** record_ptr,
|
||||
int64_t* record_len, bool* eosr) {
|
||||
int64_t position;
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->GetPosition(&position));
|
||||
if (position >= end_of_scan_range_) {
|
||||
*eosr = true;
|
||||
}
|
||||
|
||||
// If we are past the end of the range we must read to the next sync block.
|
||||
// TODO: We need better error returns from bytestream functions.
|
||||
bool sync;
|
||||
Status stat = ReadBlockHeader(&sync);
|
||||
if (!stat.ok()) {
|
||||
// Since we are past the end of the range then we might be at the end of the file.
|
||||
bool eof;
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->Eof(&eof));
|
||||
|
||||
if (!*eosr || !eof) {
|
||||
return stat;
|
||||
} else {
|
||||
return Status::OK;
|
||||
}
|
||||
}
|
||||
|
||||
if (sync && *eosr) return Status::OK;
|
||||
*eosr = false;
|
||||
|
||||
// We don't look at the keys, only the values.
|
||||
RETURN_IF_ERROR(
|
||||
SerDeUtils::SkipBytes(buffered_byte_stream_.get(), current_key_length_));
|
||||
|
||||
// Reading a compressed record, we don't know how big the output is.
|
||||
// If we are told our output buffer is too small, double it and try again.
|
||||
if (is_compressed_) {
|
||||
int in_size = current_block_length_ - current_key_length_;
|
||||
RETURN_IF_ERROR(
|
||||
SerDeUtils::ReadBytes(buffered_byte_stream_.get(), in_size, &scratch_buf_));
|
||||
|
||||
int out_size = in_size;
|
||||
bool too_small = false;
|
||||
do {
|
||||
out_size *= 2;
|
||||
if (has_string_slots_ || unparsed_data_buffer_size_ < out_size) {
|
||||
unparsed_data_buffer_ = unparsed_data_buffer_pool_->Allocate(out_size);
|
||||
unparsed_data_buffer_size_ = out_size;
|
||||
}
|
||||
|
||||
RETURN_IF_ERROR(decompress_block_function_(in_size, &scratch_buf_[0],
|
||||
out_size, unparsed_data_buffer_, &too_small));
|
||||
} while (too_small);
|
||||
|
||||
*record_ptr = unparsed_data_buffer_;
|
||||
// Read the length of the record.
|
||||
int size = SerDeUtils::ReadVLong(*record_ptr, record_len);
|
||||
*record_ptr += size;
|
||||
} else {
|
||||
// Uncompressed records
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadVLong(buffered_byte_stream_.get(), record_len));
|
||||
if (has_string_slots_ || *record_len > unparsed_data_buffer_size_) {
|
||||
unparsed_data_buffer_ = unparsed_data_buffer_pool_->Allocate(*record_len);
|
||||
unparsed_data_buffer_size_ = *record_len;
|
||||
}
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadBytes(buffered_byte_stream_.get(),
|
||||
*record_len, unparsed_data_buffer_));
|
||||
*record_ptr = unparsed_data_buffer_;
|
||||
}
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
// Add rows to the row_batch until it is full or we run off the end of the scan range.
|
||||
Status HdfsSequenceScanner::GetNext(RuntimeState* state,
|
||||
RowBatch* row_batch, bool* eosr) {
|
||||
AllocateTupleBuffer(row_batch);
|
||||
// Index into current row in row_batch.
|
||||
int row_idx = RowBatch::INVALID_ROW_INDEX;
|
||||
runtime_state_ = state;
|
||||
|
||||
// We count the time here since there is too much overhead to do
|
||||
// this on each record.
|
||||
COUNTER_SCOPED_TIMER(scan_node_->parse_time_counter());
|
||||
|
||||
// Read records from the sequence file and parse the data for each record into
|
||||
// columns. These are added to the row_batch. The loop continues until either
|
||||
// the row batch is full or we are off the end of the range.
|
||||
while (true) {
|
||||
// Current record to process and its length.
|
||||
char* record = NULL;
|
||||
int64_t record_len;
|
||||
// Get the next record and record length.
|
||||
// There are 3 cases:
|
||||
// Block compressed -- each block contains several records.
|
||||
// Record compressed -- like a regular record, but the data is compressed.
|
||||
// Uncompressed.
|
||||
if (is_blk_compressed_) {
|
||||
RETURN_IF_ERROR(GetRecordFromCompressedBlock(state, &record, &record_len, eosr));
|
||||
} else {
|
||||
// Get the next compressed or uncompressed record.
|
||||
RETURN_IF_ERROR(GetRecord(&record, &record_len, eosr));
|
||||
}
|
||||
|
||||
if (*eosr) break;
|
||||
|
||||
// Parse the current record.
|
||||
if (scan_node_->materialized_slots().size() != 0) {
|
||||
char* col_start;
|
||||
char* record_start = record;
|
||||
int num_tuples = 0;
|
||||
int num_fields = 0;
|
||||
|
||||
RETURN_IF_ERROR(delimited_text_parser_->ParseFieldLocations(
|
||||
row_batch->capacity() - row_batch->num_rows(), record_len, &record,
|
||||
&field_locations_, &num_tuples, &num_fields, &col_start));
|
||||
DCHECK(num_tuples == 1);
|
||||
|
||||
if (num_fields != 0) {
|
||||
if (!WriteFields(state, row_batch, num_fields, &row_idx).ok()) {
|
||||
// Report all the fields that have errors.
|
||||
++num_errors_in_file_;
|
||||
if (state->LogHasSpace()) {
|
||||
state->error_stream() << "file: "
|
||||
<< buffered_byte_stream_->GetLocation() << endl;
|
||||
state->error_stream() << "record: ";
|
||||
state->error_stream() << string(record_start, record_len);
|
||||
state->LogErrorStream();
|
||||
}
|
||||
if (state->abort_on_error()) {
|
||||
state->ReportFileErrors(buffered_byte_stream_->GetLocation(), 1);
|
||||
return Status("Aborted HdfsSequenceScanner due to parse errors."
|
||||
"View error log for details.");
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
RETURN_IF_ERROR(WriteTuples(state, row_batch, 1, &row_idx));
|
||||
}
|
||||
if (row_batch->IsFull()) {
|
||||
row_batch->tuple_data_pool()->AcquireData(tuple_pool_, true);
|
||||
*eosr = false;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (has_string_slots_) {
|
||||
// Pass the buffer data to the row_batch.
|
||||
// If we are at the end of a scan range then release the ownership
|
||||
row_batch->tuple_data_pool()->AcquireData(unparsed_data_buffer_pool_.get(), !*eosr);
|
||||
}
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
// TODO: apply conjuncts as slots get materialized and skip to the end of the row
|
||||
// if we determine it's not a match.
|
||||
Status HdfsSequenceScanner::WriteFields(RuntimeState* state, RowBatch* row_batch,
|
||||
int num_fields, int* row_idx) {
|
||||
// This has too much overhead to do it per-tuple
|
||||
// COUNTER_SCOPED_TIMER(scan_node_->tuple_write_timer());
|
||||
DCHECK_EQ(num_fields, scan_node_->materialized_slots().size());
|
||||
|
||||
// Keep track of where lines begin as we write out fields for error reporting
|
||||
int next_line_offset = 0;
|
||||
|
||||
// Initialize tuple_ from the partition key template tuple before writing the slots
|
||||
if (template_tuple_ != NULL) {
|
||||
memcpy(tuple_, template_tuple_, tuple_byte_size_);
|
||||
}
|
||||
|
||||
// Loop through all the parsed_data and parse out the values to slots
|
||||
bool error_in_row = false;
|
||||
for (int n = 0; n < num_fields; ++n) {
|
||||
int need_escape = false;
|
||||
int len = field_locations_[n].len;
|
||||
if (len < 0) {
|
||||
len = -len;
|
||||
need_escape = true;
|
||||
}
|
||||
next_line_offset += (len + 1);
|
||||
|
||||
if (!text_converter_->WriteSlot(state, scan_node_->materialized_slots()[n].second,
|
||||
tuple_, field_locations_[n].start, len, false, need_escape).ok()) {
|
||||
error_in_row = true;
|
||||
}
|
||||
}
|
||||
|
||||
DCHECK_EQ(num_fields, scan_node_->materialized_slots().size());
|
||||
|
||||
// TODO: The code from here down is more or less common to all scanners. Move it.
|
||||
// We now have a complete row, with everything materialized
|
||||
DCHECK(!row_batch->IsFull());
|
||||
if (*row_idx == RowBatch::INVALID_ROW_INDEX) {
|
||||
*row_idx = row_batch->AddRow();
|
||||
}
|
||||
TupleRow* current_row = row_batch->GetRow(*row_idx);
|
||||
current_row->SetTuple(tuple_idx_, tuple_);
|
||||
|
||||
// Evaluate the conjuncts and add the row to the batch
|
||||
bool conjuncts_true = scan_node_->EvalConjunctsForScanner(current_row);
|
||||
|
||||
if (conjuncts_true) {
|
||||
row_batch->CommitLastRow();
|
||||
*row_idx = RowBatch::INVALID_ROW_INDEX;
|
||||
scan_node_->IncrNumRowsReturned();
|
||||
if (scan_node_->ReachedLimit() || row_batch->IsFull()) {
|
||||
tuple_ = NULL;
|
||||
return Status::OK;
|
||||
}
|
||||
char* new_tuple = reinterpret_cast<char*>(tuple_);
|
||||
new_tuple += tuple_byte_size_;
|
||||
tuple_ = reinterpret_cast<Tuple*>(new_tuple);
|
||||
}
|
||||
|
||||
// Need to reset the tuple_ if
|
||||
// 1. eval failed (clear out null-indicator bits) OR
|
||||
// 2. there are partition keys that need to be copied
|
||||
// TODO: if the slots that need to be updated are very sparse (very few NULL slots
|
||||
// or very few partition keys), updating all the tuple memory is probably bad
|
||||
if (!conjuncts_true || template_tuple_ != NULL) {
|
||||
if (template_tuple_ != NULL) {
|
||||
memcpy(tuple_, template_tuple_, tuple_byte_size_);
|
||||
} else {
|
||||
tuple_->Init(tuple_byte_size_);
|
||||
}
|
||||
}
|
||||
|
||||
if (error_in_row) return Status("Conversion from string failed");
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status HdfsSequenceScanner::ReadFileHeader() {
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadBytes(buffered_byte_stream_.get(),
|
||||
sizeof(SEQFILE_VERSION_HEADER), &scratch_buf_));
|
||||
if (memcmp(&scratch_buf_[0], SEQFILE_VERSION_HEADER, sizeof(SEQFILE_VERSION_HEADER))) {
|
||||
if (runtime_state_->LogHasSpace()) {
|
||||
runtime_state_->error_stream() << "Invalid SEQFILE_VERSION_HEADER: '"
|
||||
<< SerDeUtils::HexDump(&scratch_buf_[0], sizeof(SEQFILE_VERSION_HEADER)) << "'";
|
||||
}
|
||||
return Status("Invalid SEQFILE_VERSION_HEADER");
|
||||
}
|
||||
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadText(buffered_byte_stream_.get(), &scratch_buf_));
|
||||
if (strncmp(&scratch_buf_[0],
|
||||
HdfsSequenceScanner::SEQFILE_KEY_CLASS_NAME, scratch_buf_.size())) {
|
||||
if (runtime_state_->LogHasSpace()) {
|
||||
runtime_state_->error_stream() << "Invalid SEQFILE_KEY_CLASS_NAME: '"
|
||||
<< string(&scratch_buf_[0], strlen(HdfsSequenceScanner::SEQFILE_KEY_CLASS_NAME))
|
||||
<< "'";
|
||||
}
|
||||
return Status("Invalid SEQFILE_KEY_CLASS_NAME");
|
||||
}
|
||||
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadText(buffered_byte_stream_.get(), &scratch_buf_));
|
||||
if (strncmp(&scratch_buf_[0], HdfsSequenceScanner::SEQFILE_VALUE_CLASS_NAME,
|
||||
scratch_buf_.size())) {
|
||||
if (runtime_state_->LogHasSpace()) {
|
||||
runtime_state_->error_stream() << "Invalid SEQFILE_VALUE_CLASS_NAME: '"
|
||||
<< string(
&scratch_buf_[0], strlen(HdfsSequenceScanner::SEQFILE_VALUE_CLASS_NAME))
|
||||
<< "'";
|
||||
}
|
||||
return Status("Invalid SEQFILE_VALUE_CLASS_NAME");
|
||||
}
|
||||
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadBoolean(buffered_byte_stream_.get(), &is_compressed_));
|
||||
RETURN_IF_ERROR(
|
||||
SerDeUtils::ReadBoolean(buffered_byte_stream_.get(), &is_blk_compressed_));
|
||||
|
||||
if (is_compressed_) {
|
||||
RETURN_IF_ERROR(
|
||||
SerDeUtils::ReadText(buffered_byte_stream_.get(), &compression_codec_));
|
||||
RETURN_IF_ERROR(SetCompression());
|
||||
}
|
||||
|
||||
RETURN_IF_ERROR(ReadFileHeaderMetadata());
|
||||
RETURN_IF_ERROR(ReadSync());
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status HdfsSequenceScanner::SetCompression() {
|
||||
if (strncmp(&compression_codec_[0], HdfsSequenceScanner::SEQFILE_DEFAULT_COMPRESSION,
|
||||
compression_codec_.size()) == 0 ||
|
||||
strncmp(&compression_codec_[0], HdfsSequenceScanner::SEQFILE_GZIP_COMPRESSION,
|
||||
compression_codec_.size()) == 0) {
|
||||
decompress_block_function_ = DecompressGzipBlock;
|
||||
|
||||
} else if (strncmp(&compression_codec_[0],
|
||||
HdfsSequenceScanner::SEQFILE_BZIP2_COMPRESSION, compression_codec_.size()) == 0) {
|
||||
decompress_block_function_ = DecompressBzip2Block;
|
||||
|
||||
} else if (strncmp(&compression_codec_[0],
|
||||
HdfsSequenceScanner::SEQFILE_SNAPPY_COMPRESSION, compression_codec_.size()) == 0) {
|
||||
decompress_block_function_ = DecompressSnappyBlock;
|
||||
} else {
|
||||
if (runtime_state_->LogHasSpace()) {
|
||||
runtime_state_->error_stream() << "Unknown Codec: "
|
||||
<< string(&compression_codec_[0], compression_codec_.size());
|
||||
}
|
||||
return Status("Unknown Codec");
|
||||
}
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status HdfsSequenceScanner::ReadFileHeaderMetadata() {
|
||||
int map_size = 0;
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadInt(buffered_byte_stream_.get(), &map_size));
|
||||
|
||||
for (int i = 0; i < map_size; ++i) {
|
||||
RETURN_IF_ERROR(SerDeUtils::SkipText(buffered_byte_stream_.get()));
|
||||
RETURN_IF_ERROR(SerDeUtils::SkipText(buffered_byte_stream_.get()));
|
||||
|
||||
}
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status HdfsSequenceScanner::ReadSync() {
|
||||
RETURN_IF_ERROR(
|
||||
SerDeUtils::ReadBytes(buffered_byte_stream_.get(), SYNC_HASH_SIZE, sync_));
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status HdfsSequenceScanner::ReadBlockHeader(bool* sync) {
|
||||
RETURN_IF_ERROR(
|
||||
SerDeUtils::ReadInt(buffered_byte_stream_.get(), ¤t_block_length_));
|
||||
*sync = false;
|
||||
if (current_block_length_ == HdfsSequenceScanner::SYNC_MARKER) {
|
||||
RETURN_IF_ERROR(CheckSync(true, NULL));
|
||||
RETURN_IF_ERROR(
|
||||
SerDeUtils::ReadInt(buffered_byte_stream_.get(), ¤t_block_length_));
|
||||
*sync = true;
|
||||
}
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadInt(buffered_byte_stream_.get(), ¤t_key_length_));
|
||||
DCHECK_EQ(current_key_length_, SEQFILE_KEY_LENGTH);
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status HdfsSequenceScanner::CheckSync(bool report_error, bool* verified) {
|
||||
char hash[SYNC_HASH_SIZE];
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadBytes(buffered_byte_stream_.get(),
|
||||
HdfsSequenceScanner::SYNC_HASH_SIZE, hash));
|
||||
|
||||
bool sync_compares_equal = memcmp(static_cast<void*>(hash),
|
||||
static_cast<void*>(sync_), HdfsSequenceScanner::SYNC_HASH_SIZE) == 0;
|
||||
if (report_error && !sync_compares_equal) {
|
||||
if (runtime_state_->LogHasSpace()) {
|
||||
runtime_state_->error_stream() << "Bad sync hash in current HdfsSequenceScanner: "
|
||||
<< buffered_byte_stream_->GetLocation() << "." << endl
|
||||
<< "Expected: '"
|
||||
<< SerDeUtils::HexDump(sync_, HdfsSequenceScanner::SYNC_HASH_SIZE)
|
||||
<< "'" << endl
|
||||
<< "Actual: '"
|
||||
<< SerDeUtils::HexDump(hash, HdfsSequenceScanner::SYNC_HASH_SIZE)
|
||||
<< "'" << endl;
|
||||
}
|
||||
return Status("Bad sync hash");
|
||||
}
|
||||
if (verified != NULL) *verified = sync_compares_equal;
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
|
||||
Status HdfsSequenceScanner::ReadCompressedBlock(RuntimeState* state) {
|
||||
int dummy;
|
||||
// Read the sync indicator and check the sync block.
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadInt(buffered_byte_stream_.get(), &dummy));
|
||||
RETURN_IF_ERROR(CheckSync(true, NULL));
|
||||
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadVLong(buffered_byte_stream_.get(),
|
||||
&num_buffered_records_in_compressed_block_));
|
||||
|
||||
// Read the compressed key length and key buffers, we don't need them.
|
||||
RETURN_IF_ERROR(SerDeUtils::SkipText(buffered_byte_stream_.get()));
|
||||
RETURN_IF_ERROR(SerDeUtils::SkipText(buffered_byte_stream_.get()));
|
||||
|
||||
// Read the compressed value length buffer. We don't need these either since the
|
||||
// records are in Text format with length included.
|
||||
RETURN_IF_ERROR(SerDeUtils::SkipText(buffered_byte_stream_.get()));
|
||||
|
||||
// Read the compressed value buffer from the unbuffered stream.
|
||||
int block_size;
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadVInt(buffered_byte_stream_.get(), &block_size));
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->SyncParent());
|
||||
{
|
||||
COUNTER_SCOPED_TIMER(scan_node_->scanner_timer());
|
||||
RETURN_IF_ERROR(
|
||||
SerDeUtils::ReadBytes(unbuffered_byte_stream_, block_size, &scratch_buf_));
|
||||
}
|
||||
RETURN_IF_ERROR(buffered_byte_stream_->SeekToParent());
|
||||
|
||||
bool too_small = false;
|
||||
do {
|
||||
if (too_small || has_string_slots_ || unparsed_data_buffer_ == NULL) {
|
||||
unparsed_data_buffer_ =
|
||||
unparsed_data_buffer_pool_->Allocate(unparsed_data_buffer_size_);
|
||||
}
|
||||
|
||||
RETURN_IF_ERROR(decompress_block_function_(block_size,
|
||||
&scratch_buf_[0], unparsed_data_buffer_size_, unparsed_data_buffer_, &too_small));
|
||||
|
||||
if (too_small) {
|
||||
unparsed_data_buffer_size_ *= 2;
|
||||
}
|
||||
} while (too_small);
|
||||
|
||||
next_record_in_compressed_block_ = unparsed_data_buffer_;
|
||||
return Status::OK;
|
||||
}
|
||||
329 be/src/exec/hdfs-sequence-scanner.h Normal file
@@ -0,0 +1,329 @@
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
|
||||
#ifndef IMPALA_EXEC_HDFS_SEQUENCE_SCANNER_H
|
||||
#define IMPALA_EXEC_HDFS_SEQUENCE_SCANNER_H
|
||||
|
||||
#include "exec/hdfs-scanner.h"
|
||||
#include "exec/buffered-byte-stream.h"
|
||||
#include "exec/delimited-text-parser.h"
|
||||
|
||||
namespace impala {
|
||||
|
||||
// This scanner parses Sequence files located in HDFS and writes the
|
||||
// content as tuples in the Impala in-memory representation of data, e.g.
|
||||
// (tuples, rows, row batches).
|
||||
// org.apache.hadoop.io.SequenceFile is the original SequenceFile implementation
|
||||
// and should be viewed as the canonical definition of this format. If
|
||||
// anything is unclear in this file you should consult the code in
|
||||
// org.apache.hadoop.io.SequenceFile.
|
||||
//
|
||||
// The following is a pseudo-BNF grammar for SequenceFile. Comments are prefixed
|
||||
// with dashes:
|
||||
//
|
||||
// seqfile ::=
|
||||
// <file-header>
|
||||
// <record-block>+
|
||||
//
|
||||
// record-block ::=
|
||||
// <record>+
|
||||
// <file-sync-hash>
|
||||
//
|
||||
// file-header ::=
|
||||
// <file-version-header>
|
||||
// <file-key-class-name>
|
||||
// <file-value-class-name>
|
||||
// <file-is-compressed>
|
||||
// <file-is-block-compressed>
|
||||
// [<file-compression-codec-class>]
|
||||
// <file-header-metadata>
|
||||
// <file-sync-field>
|
||||
//
|
||||
// file-version-header ::= Byte[4] {'S', 'E', 'Q', 6}
|
||||
//
|
||||
// -- The name of the Java class responsible for reading the key buffer
|
||||
//
|
||||
// file-key-class-name ::=
|
||||
// Text {"org.apache.hadoop.io.BytesWritable"}
|
||||
//
|
||||
// -- The name of the Java class responsible for reading the value buffer
|
||||
//
|
||||
// file-value-class-name ::=
|
||||
// Text {"org.apache.hadoop.io.Text"}
|
||||
//
|
||||
// -- Boolean variable indicating whether or not the file uses compression
|
||||
// -- for key/values in this file
|
||||
//
|
||||
// file-is-compressed ::= Byte[1]
|
||||
//
|
||||
// -- A boolean field indicating whether or not the file is block compressed.
|
||||
//
|
||||
// file-is-block-compressed ::= Byte[1] {false}
|
||||
//
|
||||
// -- The Java class name of the compression codec iff <file-is-compressed>
|
||||
// -- is true. The named class must implement
|
||||
// -- org.apache.hadoop.io.compress.CompressionCodec.
|
||||
// -- The expected value is org.apache.hadoop.io.compress.GzipCodec.
|
||||
//
|
||||
// file-compression-codec-class ::= Text
|
||||
//
|
||||
// -- A collection of key-value pairs defining metadata values for the
|
||||
// -- file. The Map is serialized using standard JDK serialization, i.e.
|
||||
// -- an Int corresponding to the number of key-value pairs, followed by
|
||||
// -- Text key and value pairs.
|
||||
//
|
||||
// file-header-metadata ::= Map<Text, Text>
|
||||
//
|
||||
// -- A 16 byte marker that is generated by the writer. This marker appears
|
||||
// -- at regular intervals at the beginning of records or record blocks
|
||||
// -- intended to enable readers to skip to a random part of the file
|
||||
// -- the sync hash is preceded by a length of -1, referred to as the sync marker
|
||||
//
|
||||
// file-sync-hash ::= Byte[16]
|
||||
//
|
||||
// -- Records are all of one type as determined by the compression bits in the header
|
||||
//
|
||||
// record ::=
|
||||
// <uncompressed-record> |
|
||||
// <block-compressed-record> |
|
||||
// <record-compressed-record>
|
||||
//
|
||||
// uncompressed-record ::=
|
||||
// <record-length>
|
||||
// <key-length>
|
||||
// <key>
|
||||
// <value>
|
||||
//
|
||||
// record-compressed-record ::=
|
||||
// <record-length>
|
||||
// <key-length>
|
||||
// <key>
|
||||
// <compressed-value>
|
||||
//
|
||||
// block-compressed-record ::=
|
||||
// <file-sync-field>
|
||||
// <key-lengths-block-size>
|
||||
// <key-lengths-block>
|
||||
// <keys-block-size>
|
||||
// <keys-block>
|
||||
// <value-lengths-block-size>
|
||||
// <value-lengths-block>
|
||||
// <values-block-size>
|
||||
// <values-block>
|
||||
//
|
||||
// record-length := Int
|
||||
// key-length := Int
|
||||
// key-lengths-block-size := Int
// value-lengths-block-size := Int
|
||||
//
|
||||
// keys-block :: = Byte[keys-block-size]
|
||||
// values-block :: = Byte[values-block-size]
|
||||
//
|
||||
// -- The key-lengths and value-lengths blocks are a sequence of lengths encoded
|
||||
// -- in ZeroCompressedInteger (VInt) format.
|
||||
//
|
||||
// key-lengths-block :: = Byte[key-lengths-block-size]
|
||||
// value-lengths-block :: = Byte[value-lengths-block-size]
|
||||
//
|
||||
// Byte ::= An eight-bit byte
|
||||
//
|
||||
// VInt ::= Variable length integer. The high-order bit of each byte
|
||||
// indicates whether more bytes remain to be read. The low-order seven
|
||||
// bits are appended as increasingly more significant bits in the
|
||||
// resulting integer value.
|
||||
//
|
||||
// Int ::= A four-byte integer in big-endian format.
|
||||
//
|
||||
// Text ::= VInt, Chars (Length prefixed UTF-8 characters)
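// A standalone sketch (not part of this header) of the VInt and Text encodings
// described above, for intuition only. Assumptions: the simple continuation-bit
// scheme exactly as worded in the comment; DecodeVInt/DecodeText are hypothetical
// helpers, the real readers are SerDeUtils::ReadVLong/ReadText.
#include <cstdint>
#include <cstdio>
#include <string>
#include <vector>

// Low 7 bits of each byte are appended as increasingly more significant bits;
// the high-order bit says whether more bytes remain.
static int64_t DecodeVInt(const std::vector<uint8_t>& buf, size_t* pos) {
  int64_t value = 0;
  int shift = 0;
  while (true) {
    uint8_t b = buf[(*pos)++];
    value |= static_cast<int64_t>(b & 0x7f) << shift;
    if ((b & 0x80) == 0) break;
    shift += 7;
  }
  return value;
}

// A Text field is a VInt length followed by that many UTF-8 bytes.
static std::string DecodeText(const std::vector<uint8_t>& buf, size_t* pos) {
  int64_t len = DecodeVInt(buf, pos);
  std::string s(buf.begin() + *pos, buf.begin() + *pos + len);
  *pos += len;
  return s;
}

int main() {
  // 0x05 = length 5, followed by "hello".
  std::vector<uint8_t> buf = {0x05, 'h', 'e', 'l', 'l', 'o'};
  size_t pos = 0;
  printf("%s\n", DecodeText(buf, &pos).c_str());  // prints "hello"
  return 0;
}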
|
||||
|
||||
class HdfsSequenceScanner : public HdfsScanner {
|
||||
public:
|
||||
HdfsSequenceScanner(HdfsScanNode* scan_node, const TupleDescriptor* tuple_desc,
|
||||
Tuple* template_tuple, MemPool* tuple_pool);
|
||||
|
||||
virtual Status Prepare(RuntimeState* state, ByteStream* byte_stream);
|
||||
virtual Status GetNext(RuntimeState* state, RowBatch* row_batch, bool* eosr);
|
||||
|
||||
private:
|
||||
// Sync indicator
|
||||
const static int SYNC_MARKER = -1;
|
||||
|
||||
// Size of the sync hash field
|
||||
const static int SYNC_HASH_SIZE = 16;
|
||||
|
||||
// The key class name located in the SeqFile Header.
|
||||
// This is always "org.apache.hadoop.io.BytesWritable"
|
||||
static const char* const SEQFILE_KEY_CLASS_NAME;
|
||||
|
||||
// The value class name located in the SeqFile Header.
|
||||
// This is always "org.apache.hadoop.io.Text"
|
||||
static const char* const SEQFILE_VALUE_CLASS_NAME;
|
||||
|
||||
// The four byte SeqFile version header present at the beginning of every
|
||||
// SeqFile file: {'S', 'E', 'Q', 6}
|
||||
static const uint8_t SEQFILE_VERSION_HEADER[4];
|
||||
|
||||
// The key should always be 4 bytes.
|
||||
static const int SEQFILE_KEY_LENGTH;
|
||||
|
||||
// The names of the Codecs we support.
|
||||
static const char* const SEQFILE_DEFAULT_COMPRESSION;
|
||||
static const char* const SEQFILE_GZIP_COMPRESSION;
|
||||
static const char* const SEQFILE_BZIP2_COMPRESSION;
|
||||
static const char* const SEQFILE_SNAPPY_COMPRESSION;
|
||||
|
||||
// Size to read when searching for the first record in a split
|
||||
// This probably ought to be a derived number from the environment.
|
||||
const static int FILE_BLOCK_SIZE = 4096;
|
||||
|
||||
// Initialises any state required at the beginning of a new scan range.
|
||||
// If not at the beginning of the file it will trigger a search for the
|
||||
// next sync block, where the scan will start.
|
||||
virtual Status InitCurrentScanRange(RuntimeState* state,
|
||||
HdfsScanRange* scan_range, ByteStream* byte_stream);
|
||||
|
||||
// Writes the intermediate parsed data in to slots, outputting
|
||||
// tuples to row_batch as they complete.
|
||||
// Input Parameters:
|
||||
// state: Runtime state into which we log errors
|
||||
// row_batch: Row batch into which to write new tuples
|
||||
// num_fields: Total number of fields contained in parsed_data_
|
||||
// Input/Output Parameters
|
||||
// row_idx: Index of current row in row_batch.
|
||||
Status WriteFields(RuntimeState* state, RowBatch*
|
||||
row_batch, int num_fields, int* row_idx);
|
||||
|
||||
// Find the first record of a scan range.
|
||||
// If the scan range is not at the beginning of the file then this is called to
|
||||
// move the buffered_byte_stream_ seek point to before the next sync field.
|
||||
// If there is none present then the buffered_byte_stream_ will be beyond the
|
||||
// end of the scan range and the scan will end.
|
||||
Status FindFirstRecord(RuntimeState *state);
|
||||
|
||||
// Read the current Sequence file header from the beginning of the file.
|
||||
// Verifies:
|
||||
// version number
|
||||
// key and data classes
|
||||
// Sets:
|
||||
// is_compressed_
|
||||
// is_blk_compressed_
|
||||
// compression_codec_
|
||||
// sync_
|
||||
Status ReadFileHeader();
|
||||
|
||||
// Read the Sequence file Header Metadata section in the current file.
|
||||
// We don't use this information, so it is just skipped.
|
||||
Status ReadFileHeaderMetadata();
|
||||
|
||||
// Read and validate a RowGroup sync field.
|
||||
Status ReadSync();
|
||||
|
||||
// Read the record header, return if there was a sync block.
|
||||
// Sets:
|
||||
// current_block_length_
|
||||
Status ReadBlockHeader(bool* sync);
|
||||
|
||||
// Find first record in a scan range.
|
||||
// Sets the current_byte_stream_ to this record.
|
||||
Status FindFirstRecord();
|
||||
|
||||
// Read compressed blocks and iterate through the records in each block.
|
||||
// Output:
|
||||
// record_ptr: pointer to the record.
// record_len: length of the record.
// eosr: set to true if we are at the end of the scan range.
Status GetRecordFromCompressedBlock(RuntimeState *state,
char** record_ptr, int64_t* record_len, bool* eosr);
|
||||
|
||||
// Read compressed or uncompressed records from the byte stream into memory
|
||||
// in unparsed_data_buffer_pool_.
|
||||
// Output:
|
||||
// record_ptr: pointer to the record.
// record_len: length of the record.
// eosr: set to true if we are at the end of the scan range.
|
||||
Status GetRecord(char** record_ptr, int64_t* record_len, bool* eosr);
|
||||
|
||||
// Read a compressed block.
|
||||
// Decompress to unparsed_data_buffer_ allocated from unparsed_data_buffer_pool_.
|
||||
Status ReadCompressedBlock(RuntimeState *state);
|
||||
|
||||
// sets decompress_block_function_ by reading the compression_codec_.
|
||||
Status SetCompression();
|
||||
|
||||
// Read and verify a sync block.
// report_error: if false we are scanning for the beginning of a range and
// we don't want to report errors.
// verified: output true if there was a correct sync hash.
|
||||
Status CheckSync(bool report_error, bool *verified);
|
||||
|
||||
// a buffered byte stream to wrap the stream we are passed.
|
||||
boost::scoped_ptr<BufferedByteStream> buffered_byte_stream_;
|
||||
|
||||
// Helper class for picking fields and rows from delimited text.
|
||||
boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_;
|
||||
std::vector<DelimitedTextParser::FieldLocation> field_locations_;
|
||||
|
||||
// Parser to find the first record. This uses different delimiters.
|
||||
boost::scoped_ptr<DelimitedTextParser> find_first_parser_;
|
||||
|
||||
// Helper class for converting text fields to internal types.
|
||||
boost::scoped_ptr<TextConverter> text_converter_;
|
||||
|
||||
// Function pointer to the decompression code for the selected codec.
|
||||
// Uncompresses data from 'in' to 'out'.
|
||||
// Sets too_small to true if output_length is not big enough to hold the
// uncompressed data.
|
||||
Status (*decompress_block_function_) (int input_length, char* in,
|
||||
int output_length, char* out, bool* too_small);
|
||||
|
||||
// Runtime state for reporting file parsing errors.
|
||||
RuntimeState* runtime_state_;
|
||||
|
||||
// The original byte stream we are passed.
|
||||
ByteStream* unbuffered_byte_stream_;
|
||||
|
||||
// The sync hash read in from the file header.
|
||||
char sync_[SYNC_HASH_SIZE];
|
||||
|
||||
// File compression or not.
|
||||
bool is_compressed_;
|
||||
// Block compression or not.
|
||||
bool is_blk_compressed_;
|
||||
|
||||
// Compression codec specified in the Sequence file Header as a SerDe Text.
|
||||
std::vector<char> compression_codec_;
|
||||
|
||||
// Location (file name) of previous scan range.
|
||||
std::string previous_location_;
|
||||
|
||||
// Byte offset of the end of the scan range.
|
||||
int end_of_scan_range_;
|
||||
|
||||
// Length of the current sequence file block (or record).
|
||||
int current_block_length_;
|
||||
|
||||
// Length of the current key. This should always be SEQFILE_KEY_LENGTH.
|
||||
int current_key_length_;
|
||||
|
||||
// Pool for allocating unparsed_data_buffer_.
|
||||
boost::scoped_ptr<MemPool> unparsed_data_buffer_pool_;
|
||||
|
||||
// Buffer for data read from HDFS or from decompressing the HDFS data.
|
||||
char* unparsed_data_buffer_;
|
||||
|
||||
// Size of the unparsed_data_buffer_.
|
||||
int64_t unparsed_data_buffer_size_;
|
||||
|
||||
// Number of buffered records in unparsed_data_buffer_ from block compressed data.
|
||||
int64_t num_buffered_records_in_compressed_block_;
|
||||
|
||||
// Next record from block compressed data.
|
||||
char* next_record_in_compressed_block_;
|
||||
|
||||
// Temporary buffer used for reading headers and compressed data.
|
||||
// It will grow to be big enough for the largest compressed record or block.
|
||||
std::vector<char> scratch_buf_;
|
||||
};
|
||||
|
||||
} // namespace impala
|
||||
|
||||
#endif // IMPALA_EXEC_HDFS_SEQUENCE_SCANNER_H
|
||||
@@ -2,13 +2,13 @@
|
||||
|
||||
#include "runtime/runtime-state.h"
|
||||
#include "exec/hdfs-text-scanner.h"
|
||||
#include "util/string-parser.h"
|
||||
#include "runtime/tuple.h"
|
||||
#include "runtime/row-batch.h"
|
||||
#include "runtime/timestamp-value.h"
|
||||
#include "exec/text-converter.h"
|
||||
#include "util/cpu-info.h"
|
||||
#include "exec/hdfs-scan-node.h"
|
||||
#include "exec/delimited-text-parser.h"
|
||||
#include "exec/text-converter.inline.h"
|
||||
|
||||
using namespace impala;
|
||||
using namespace std;
|
||||
@@ -19,8 +19,8 @@ HdfsTextScanner::HdfsTextScanner(HdfsScanNode* scan_node,
|
||||
boundary_mem_pool_(new MemPool()),
|
||||
boundary_row_(boundary_mem_pool_.get()),
|
||||
boundary_column_(boundary_mem_pool_.get()),
|
||||
column_idx_(0),
|
||||
slot_idx_(0),
|
||||
delimited_text_parser_(NULL),
|
||||
text_converter_(NULL),
|
||||
byte_buffer_pool_(new MemPool()),
|
||||
byte_buffer_ptr_(NULL),
|
||||
@@ -32,10 +32,18 @@ HdfsTextScanner::HdfsTextScanner(HdfsScanNode* scan_node,
|
||||
const HdfsTableDescriptor* hdfs_table =
|
||||
static_cast<const HdfsTableDescriptor*>(tuple_desc->table_desc());
|
||||
|
||||
tuple_delim_ = hdfs_table->line_delim();
|
||||
field_delim_ = hdfs_table->field_delim();
|
||||
collection_item_delim_ = hdfs_table->collection_delim();
|
||||
escape_char_ = hdfs_table->escape_char();
|
||||
text_converter_.reset(new TextConverter(hdfs_table->escape_char(), tuple_pool_));
|
||||
|
||||
char field_delim = hdfs_table->field_delim();
|
||||
char collection_delim = hdfs_table->collection_delim();
|
||||
if (scan_node_->materialized_slots().size() == 0) {
|
||||
field_delim = '\0';
|
||||
collection_delim = '\0';
|
||||
}
|
||||
delimited_text_parser_.reset(new DelimitedTextParser(scan_node->column_to_slot_index(),
|
||||
scan_node->GetNumPartitionKeys(), scan_node->parse_time_counter(),
|
||||
hdfs_table->line_delim(), field_delim, collection_delim,
|
||||
hdfs_table->escape_char()));
|
||||
}
|
||||
|
||||
Status HdfsTextScanner::InitCurrentScanRange(RuntimeState* state,
|
||||
@@ -49,7 +57,6 @@ Status HdfsTextScanner::InitCurrentScanRange(RuntimeState* state,
|
||||
// entries 0 through N-1 in column_idx_to_slot_idx. If this changes, we will need
|
||||
// another layer of indirection to map text-file column indexes onto the
|
||||
// column_idx_to_slot_idx table used below.
|
||||
column_idx_ = scan_node_->GetNumPartitionKeys();
|
||||
slot_idx_ = 0;
|
||||
|
||||
// Pre-load byte buffer with size of entire range, if possible
|
||||
@@ -59,18 +66,18 @@ Status HdfsTextScanner::InitCurrentScanRange(RuntimeState* state,
|
||||
|
||||
boundary_column_.Clear();
|
||||
boundary_row_.Clear();
|
||||
delimited_text_parser_->ParserReset();
|
||||
|
||||
// Offset may not point to tuple boundary
|
||||
if (scan_range->offset != 0) {
|
||||
int first_tuple_offset = FindFirstTupleStart(byte_buffer_, byte_buffer_read_size_);
|
||||
int first_tuple_offset =
|
||||
delimited_text_parser_->FindFirstTupleStart(byte_buffer_, byte_buffer_read_size_);
|
||||
DCHECK_LE(first_tuple_offset, min(state->file_buffer_size(),
|
||||
current_range_remaining_len_));
|
||||
current_range_remaining_len_));
|
||||
byte_buffer_ptr_ += first_tuple_offset;
|
||||
current_range_remaining_len_ -= first_tuple_offset;
|
||||
}
|
||||
|
||||
last_char_is_escape_ = false;
|
||||
current_column_has_escape_ = false;
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
@@ -88,7 +95,7 @@ Status HdfsTextScanner::FillByteBuffer(RuntimeState* state, int64_t size) {
|
||||
{
|
||||
COUNTER_SCOPED_TIMER(scan_node_->scanner_timer());
|
||||
RETURN_IF_ERROR(current_byte_stream_->Read(byte_buffer_, read_size,
|
||||
&byte_buffer_read_size_));
|
||||
&byte_buffer_read_size_));
|
||||
}
|
||||
byte_buffer_end_ = byte_buffer_ + byte_buffer_read_size_;
|
||||
byte_buffer_ptr_ = byte_buffer_;
|
||||
@@ -101,106 +108,19 @@ Status HdfsTextScanner::Prepare(RuntimeState* state, ByteStream* byte_stream) {
|
||||
|
||||
current_range_remaining_len_ = 0;
|
||||
|
||||
text_converter_.reset(new TextConverter(escape_char_, tuple_pool_));
|
||||
|
||||
// Allocate the scratch space for two pass parsing. The most fields we can go
|
||||
// through in one parse pass is the batch size (tuples) * the number of fields per tuple
|
||||
// TODO: This should probably be based on L2/L3 cache sizes (as should the batch size)
|
||||
field_locations_.resize(state->batch_size() * scan_node_->materialized_slots().size());
|
||||
|
||||
// Initialize the sse search registers.
|
||||
// TODO: is this safe to do in prepare? Not sure if the compiler/system
|
||||
// will manage these registers for us.
|
||||
char tmp[SSEUtil::CHARS_PER_128_BIT_REGISTER];
|
||||
memset(tmp, 0, sizeof(tmp));
|
||||
tmp[0] = tuple_delim_;
|
||||
xmm_tuple_search_ = _mm_loadu_si128(reinterpret_cast<__m128i*>(tmp));
|
||||
tmp[0] = escape_char_;
|
||||
xmm_escape_search_ = _mm_loadu_si128(reinterpret_cast<__m128i*>(tmp));
|
||||
tmp[0] = field_delim_;
|
||||
tmp[1] = collection_item_delim_;
|
||||
xmm_field_search_ = _mm_loadu_si128(reinterpret_cast<__m128i*>(tmp));
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
// Find the start of the first full tuple in buffer by looking for the end of
|
||||
// the previous tuple.
|
||||
// TODO: most of this is not tested. We need some tailored data to exercise the boundary
|
||||
// cases
|
||||
int HdfsTextScanner::FindFirstTupleStart(char* buffer, int len) {
|
||||
int tuple_start = 0;
|
||||
char* buffer_start = buffer;
|
||||
while (tuple_start < len) {
|
||||
if (CpuInfo::Instance()->IsSupported(CpuInfo::SSE4_2)) {
|
||||
__m128i xmm_buffer, xmm_tuple_mask;
|
||||
while (len - tuple_start >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
|
||||
// TODO: can we parallelize this as well? Are there multiple sse execution units?
|
||||
// Load the next 16 bytes into the xmm register and do strchr for the
|
||||
// tuple delimiter.
|
||||
xmm_buffer = _mm_loadu_si128(reinterpret_cast<__m128i*>(buffer));
|
||||
xmm_tuple_mask = _mm_cmpistrm(xmm_tuple_search_, xmm_buffer, SSEUtil::STRCHR_MODE);
|
||||
int tuple_mask = _mm_extract_epi16(xmm_tuple_mask, 0);
|
||||
if (tuple_mask != 0) {
|
||||
for (int i = 0; i < SSEUtil::CHARS_PER_128_BIT_REGISTER; ++i) {
|
||||
if ((tuple_mask & SSEUtil::SSE_BITMASK[i]) != 0) {
|
||||
tuple_start += i + 1;
|
||||
buffer += i + 1;
|
||||
goto end;
|
||||
}
|
||||
}
|
||||
}
|
||||
tuple_start += SSEUtil::CHARS_PER_128_BIT_REGISTER;
|
||||
buffer += SSEUtil::CHARS_PER_128_BIT_REGISTER;
|
||||
}
|
||||
} else {
|
||||
for (int i = tuple_start; i < len; ++i) {
|
||||
char c = *buffer++;
|
||||
if (c == tuple_delim_) {
|
||||
tuple_start = i + 1;
|
||||
goto end;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
end:
|
||||
if (escape_char_ != '\0') {
|
||||
// Scan backwards for escape characters. We do this after
|
||||
// finding the tuple break rather than during the (above)
|
||||
// forward scan to make the forward scan faster. This will
|
||||
// perform worse if there are many characters right before the
|
||||
// tuple break that are all escape characters, but that is
|
||||
// unlikely.
|
||||
int num_escape_chars = 0;
|
||||
int before_tuple_end = tuple_start - 2;
|
||||
for (; before_tuple_end >= 0; --before_tuple_end) {
|
||||
if (buffer_start[before_tuple_end] == escape_char_) {
|
||||
++num_escape_chars;
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
// TODO: This sucks. All the preceding characters before the tuple delim were
|
||||
// escape characters. We need to read from the previous block to see what to do.
|
||||
DCHECK_GT(before_tuple_end, 0);
|
||||
|
||||
// An even number of escape characters means they cancel out and this tuple break
|
||||
// is *not* escaped.
|
||||
if (num_escape_chars % 2 == 0) {
|
||||
return tuple_start;
|
||||
}
|
||||
} else {
|
||||
return tuple_start;
|
||||
}
|
||||
}
|
||||
return tuple_start;
|
||||
}
|
||||
|
||||
Status HdfsTextScanner::GetNext(RuntimeState* state, RowBatch* row_batch, bool* eosr) {
|
||||
AllocateTupleBuffer(row_batch);
|
||||
// Index into current row in row_batch.
|
||||
int row_idx = RowBatch::INVALID_ROW_INDEX;
|
||||
int first_materialised_col_idx = scan_node_->GetNumPartitionKeys();
|
||||
char* col_start = NULL;
|
||||
|
||||
// This loop contains a small state machine:
|
||||
// 1. byte_buffer_ptr_ != byte_buffer_end_: no need to read more, process what's in
|
||||
@@ -220,25 +140,25 @@ Status HdfsTextScanner::GetNext(RuntimeState* state, RowBatch* row_batch, bool*
|
||||
// TODO: log an error, we have an incomplete tuple at the end of the file
|
||||
current_range_remaining_len_ = 0;
|
||||
slot_idx_ = 0;
|
||||
column_idx_ = first_materialised_col_idx;
|
||||
delimited_text_parser_->ParserReset();
|
||||
boundary_column_.Clear();
|
||||
byte_buffer_ptr_ = NULL;
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
if (current_range_remaining_len_ == 0) {
|
||||
// Check if a tuple is straddling this block and the next:
|
||||
// 1. boundary_column_ is not empty
|
||||
// 2. column_idx_ != first_materialised_col_idx if we are halfway through
|
||||
// reading a tuple
|
||||
// 2. if we are halfway through reading a tuple: !AtStart.
|
||||
// 3. We are in the middle of the first column.
|
||||
// We need to continue scanning until the end of the tuple. Note that
|
||||
// boundary_column_ will be empty if we are on a column boundary, but could still
|
||||
// be inside a tuple. Similarly column_idx_ could be first_materialised_col_idx
|
||||
// if we are in the middle of reading the first column. Therefore we need both
|
||||
// checks.
|
||||
// We cannot use slot_idx, since that is incremented only if we are
|
||||
// materialising slots, which is not true for e.g. count(*)
|
||||
// boundary_column_ will be empty if we are on a column boundary,
|
||||
// but could still be inside a tuple. Similarly column_idx_ could be
|
||||
// first_materialised_col_idx if we are in the middle of reading the first
|
||||
// column. Therefore we need both checks.
|
||||
// TODO: test that hits this condition.
|
||||
if (!boundary_column_.Empty() || column_idx_ != first_materialised_col_idx) {
|
||||
if (!boundary_column_.Empty() || !delimited_text_parser_->AtTupleStart() ||
|
||||
(col_start != NULL && col_start != byte_buffer_ptr_)) {
|
||||
current_range_remaining_len_ = -1;
|
||||
continue;
|
||||
}
|
||||
@@ -262,7 +182,6 @@ Status HdfsTextScanner::GetNext(RuntimeState* state, RowBatch* row_batch, bool*
|
||||
int previous_num_rows = num_rows_returned_;
|
||||
// With two pass approach, we need to save some of the state before any of the file
|
||||
// was parsed
|
||||
char* col_start = NULL;
|
||||
char* line_start = byte_buffer_ptr_;
|
||||
int num_tuples = 0;
|
||||
int num_fields = 0;
|
||||
@@ -272,8 +191,10 @@ Status HdfsTextScanner::GetNext(RuntimeState* state, RowBatch* row_batch, bool*
|
||||
if (current_range_remaining_len_ < 0) {
|
||||
max_tuples = 1;
|
||||
}
|
||||
RETURN_IF_ERROR(ParseFileBuffer(max_tuples, &num_tuples, &num_fields, &col_start));
|
||||
|
||||
char* previous_buffer_ptr = byte_buffer_ptr_;
|
||||
RETURN_IF_ERROR(delimited_text_parser_->ParseFieldLocations(max_tuples,
|
||||
byte_buffer_end_ - byte_buffer_ptr_, &byte_buffer_ptr_,
|
||||
&field_locations_, &num_tuples, &num_fields, &col_start));
|
||||
int bytes_processed = byte_buffer_ptr_ - line_start;
|
||||
current_range_remaining_len_ -= bytes_processed;
|
||||
|
||||
@@ -281,18 +202,28 @@ Status HdfsTextScanner::GetNext(RuntimeState* state, RowBatch* row_batch, bool*
|
||||
|
||||
if (scan_node_->materialized_slots().size() != 0) {
|
||||
if (num_fields != 0) {
|
||||
// There can be one partial tuple which returned no more fields from this buffer.
|
||||
DCHECK_LE(num_tuples, num_fields + 1);
|
||||
if (!boundary_column_.Empty()) {
|
||||
CopyBoundaryField(&field_locations_[0]);
|
||||
boundary_column_.Clear();
|
||||
}
|
||||
RETURN_IF_ERROR(WriteFields(state, row_batch, num_fields, &row_idx, &line_start));
|
||||
}
|
||||
} else if (col_start != previous_buffer_ptr) {
|
||||
// If we saw any delimiters col_start will move, clear the boundary_row_.
|
||||
boundary_row_.Clear();
|
||||
}
|
||||
} else if (num_tuples != 0) {
|
||||
// If we are doing count(*) then we return tuples only containing partition keys
|
||||
boundary_row_.Clear();
|
||||
line_start = byte_buffer_ptr_;
|
||||
RETURN_IF_ERROR(WriteTuples(state, row_batch, num_tuples, &row_idx));
|
||||
}
|
||||
|
||||
// Cannot reuse file buffer if there are non-copied string slots materialized
|
||||
// TODO: If the tuple data contains very sparse string slots, we waste a lot of memory.
|
||||
// Instead, we should consider copying the tuples to a compact new buffer in this
|
||||
// case.
|
||||
// TODO: If the tuple data contains very sparse string slots, we waste a lot of
|
||||
// memory. Instead, we should consider copying the tuples to a compact new buffer
|
||||
// in this case.
|
||||
if (num_rows_returned_ > previous_num_rows && has_string_slots_) {
|
||||
reuse_byte_buffer_ = false;
|
||||
}
|
||||
@@ -301,8 +232,8 @@ Status HdfsTextScanner::GetNext(RuntimeState* state, RowBatch* row_batch, bool*
|
||||
break;
|
||||
}
|
||||
|
||||
// Save contents that are split across files
|
||||
if (col_start != byte_buffer_ptr_) {
|
||||
// Save contents that are split across buffers if we are going to return this column
|
||||
if (col_start != byte_buffer_ptr_ && delimited_text_parser_->ReturnCurrentColumn()) {
|
||||
boundary_column_.Append(col_start, byte_buffer_ptr_ - col_start);
|
||||
boundary_row_.Append(line_start, byte_buffer_ptr_ - line_start);
|
||||
}
|
||||
@@ -324,12 +255,12 @@ Status HdfsTextScanner::GetNext(RuntimeState* state, RowBatch* row_batch, bool*
|
||||
// GetNext() call.
|
||||
if (row_batch->IsFull()) {
|
||||
*eosr = false;
|
||||
break;
|
||||
|
||||
DCHECK(delimited_text_parser_->AtTupleStart());
|
||||
return Status::OK;
|
||||
}
|
||||
}
|
||||
|
||||
DCHECK_EQ(column_idx_, first_materialised_col_idx);
|
||||
|
||||
// This is the only non-error return path for this function. There are
|
||||
// two return paths:
|
||||
// 1. EOS: limit is reached or scan range is complete
|
||||
@@ -341,11 +272,12 @@ Status HdfsTextScanner::GetNext(RuntimeState* state, RowBatch* row_batch, bool*
|
||||
}
|
||||
row_batch->tuple_data_pool()->AcquireData(tuple_pool_, !*eosr);
|
||||
|
||||
DCHECK(delimited_text_parser_->AtTupleStart());
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
void HdfsTextScanner::ReportRowParseError(RuntimeState* state, char* line_start,
|
||||
int len) {
|
||||
int len) {
|
||||
++num_errors_in_file_;
|
||||
if (state->LogHasSpace()) {
|
||||
state->error_stream() << "file: " << current_byte_stream_->GetLocation() << endl;
|
||||
@@ -377,8 +309,20 @@ Status HdfsTextScanner::WriteFields(RuntimeState* state, RowBatch* row_batch,
|
||||
|
||||
// Loop through all the parsed_data and parse out the values to slots
|
||||
for (int n = 0; n < num_fields; ++n) {
|
||||
int need_escape = false;
|
||||
int len = field_locations_[n].len;
|
||||
if (len < 0) {
|
||||
len = -len;
|
||||
need_escape = true;
|
||||
}
|
||||
next_line_offset += (len + 1);
|
||||
|
||||
WriteSlots(state, n, &next_line_offset);
|
||||
boundary_row_.Clear();
|
||||
if (!text_converter_->WriteSlot(state,
|
||||
scan_node_->materialized_slots()[slot_idx_].second, tuple_,
|
||||
field_locations_[n].start, len, false, need_escape).ok()) {
|
||||
error_in_row_ = true;
|
||||
}
|
||||
|
||||
// If slot_idx_ equals the number of materialized slots, we have completed
|
||||
// parsing the tuple. At this point we can:
|
||||
@@ -444,90 +388,7 @@ Status HdfsTextScanner::WriteFields(RuntimeState* state, RowBatch* row_batch,
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
|
||||
Status HdfsTextScanner::WriteSlots(RuntimeState* state, int tuple_idx,
|
||||
int* next_line_offset) {
|
||||
boundary_row_.Clear();
|
||||
SlotDescriptor* slot_desc = scan_node_->materialized_slots()[slot_idx_].second;
|
||||
char* data = field_locations_[tuple_idx].start;
|
||||
int len = field_locations_[tuple_idx].len;
|
||||
bool need_escape = false;
|
||||
if (len < 0) {
|
||||
len = -len;
|
||||
need_escape = true;
|
||||
}
|
||||
next_line_offset += (len + 1);
|
||||
if (len == 0) {
|
||||
tuple_->SetNull(slot_desc->null_indicator_offset());
|
||||
} else {
|
||||
StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
|
||||
void* slot = tuple_->GetSlot(slot_desc->tuple_offset());
|
||||
|
||||
// Parse the raw-text data. At this point:
|
||||
switch (slot_desc->type()) {
|
||||
case TYPE_STRING:
|
||||
reinterpret_cast<StringValue*>(slot)->ptr = data;
|
||||
reinterpret_cast<StringValue*>(slot)->len = len;
|
||||
if (need_escape) {
|
||||
text_converter_->UnescapeString(reinterpret_cast<StringValue*>(slot));
|
||||
}
|
||||
break;
|
||||
case TYPE_BOOLEAN:
|
||||
*reinterpret_cast<bool*>(slot) =
|
||||
StringParser::StringToBool(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_TINYINT:
|
||||
*reinterpret_cast<int8_t*>(slot) =
|
||||
StringParser::StringToInt<int8_t>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_SMALLINT:
|
||||
*reinterpret_cast<int16_t*>(slot) =
|
||||
StringParser::StringToInt<int16_t>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_INT:
|
||||
*reinterpret_cast<int32_t*>(slot) =
|
||||
StringParser::StringToInt<int32_t>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_BIGINT:
|
||||
*reinterpret_cast<int64_t*>(slot) =
|
||||
StringParser::StringToInt<int64_t>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_FLOAT:
|
||||
*reinterpret_cast<float*>(slot) =
|
||||
StringParser::StringToFloat<float>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_DOUBLE:
|
||||
*reinterpret_cast<double*>(slot) =
|
||||
StringParser::StringToFloat<double>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_TIMESTAMP: {
|
||||
string strbuf(data, len);
|
||||
*reinterpret_cast<TimestampValue*>(slot) = TimestampValue(strbuf);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
DCHECK(false) << "bad slot type: " << TypeToString(slot_desc->type());
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: add warning for overflow case
|
||||
if (parse_result == StringParser::PARSE_FAILURE) {
|
||||
error_in_row_ = true;
|
||||
tuple_->SetNull(slot_desc->null_indicator_offset());
|
||||
if (state->LogHasSpace()) {
|
||||
state->error_stream() << "Error converting column: "
|
||||
<< slot_desc->col_pos() << " TO "
|
||||
// TODO: num_partition_keys_ no longer visible to scanner.
|
||||
// << slot_desc->col_pos() - num_partition_keys_ << " TO "
|
||||
<< TypeToString(slot_desc->type()) << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
void HdfsTextScanner::CopyBoundaryField(FieldLocations* data) {
|
||||
void HdfsTextScanner::CopyBoundaryField(DelimitedTextParser::FieldLocation* data) {
|
||||
const int total_len = data->len + boundary_column_.Size();
|
||||
char* str_data = reinterpret_cast<char*>(tuple_pool_->Allocate(total_len));
|
||||
memcpy(str_data, boundary_column_.str().ptr, boundary_column_.Size());
|
||||
@@ -536,212 +397,3 @@ void HdfsTextScanner::CopyBoundaryField(FieldLocations* data) {
|
||||
data->len = total_len;
|
||||
}
|
||||
|
||||
// Updates the values in the field and tuple masks, escaping them if necessary.
|
||||
// If the character at n is an escape character, then delimiters (tuple/field/escape
|
||||
// characters) at n+1 don't count.
|
||||
inline void ProcessEscapeMask(int escape_mask, bool* last_char_is_escape, int* field_mask,
|
||||
int* tuple_mask) {
|
||||
// Escape characters can escape escape characters.
|
||||
bool first_char_is_escape = *last_char_is_escape;
|
||||
bool escape_next = first_char_is_escape;
|
||||
for (int i = 0; i < SSEUtil::CHARS_PER_128_BIT_REGISTER; ++i) {
|
||||
if (escape_next) {
|
||||
escape_mask &= ~SSEUtil::SSE_BITMASK[i];
|
||||
}
|
||||
escape_next = escape_mask & SSEUtil::SSE_BITMASK[i];
|
||||
}
|
||||
|
||||
// Remember last character for the next iteration
|
||||
*last_char_is_escape = escape_mask &
|
||||
SSEUtil::SSE_BITMASK[SSEUtil::CHARS_PER_128_BIT_REGISTER - 1];
|
||||
|
||||
// Shift the escape mask up one so it lines up with the tuple and field masks
// (instead of marking the character before) and set the correct first bit.
|
||||
escape_mask = escape_mask << 1 | first_char_is_escape;
|
||||
|
||||
// If escape_mask[n] is true, then tuple/field_mask[n] is escaped
|
||||
*tuple_mask &= ~escape_mask;
|
||||
*field_mask &= ~escape_mask;
|
||||
}
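// A scalar worked example of the mask fix-up above (assumption: plain ints
// stand in for the SSE masks; bit n corresponds to character n of the chunk).
// Standalone, not part of the scanner.
#include <cstdio>

int main() {
  // 16-char chunk "a\,b,..." : escape char at position 1, field delimiters at
  // positions 2 and 4. The delimiter at position 2 is escaped and must not count.
  int escape_mask = 1 << 1;
  int field_mask = (1 << 2) | (1 << 4);
  bool last_char_is_escape = false;  // carried over from the previous chunk

  bool first_char_is_escape = last_char_is_escape;
  bool escape_next = first_char_is_escape;
  for (int i = 0; i < 16; ++i) {
    if (escape_next) escape_mask &= ~(1 << i);  // an escaped escape escapes nothing
    escape_next = (escape_mask & (1 << i)) != 0;
  }
  last_char_is_escape = (escape_mask & (1 << 15)) != 0;

  // Shift so each escape bit lines up with the character it escapes,
  // then clear those delimiters from the field mask.
  escape_mask = (escape_mask << 1) | (first_char_is_escape ? 1 : 0);
  field_mask &= ~escape_mask;
  printf("field_mask = 0x%x\n", field_mask);  // 0x10: only the delimiter at 4 survives
  return 0;
}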
|
||||
|
||||
// SSE optimized raw text file parsing. SSE4_2 added an instruction (with 3 modes) for
|
||||
// text processing. The modes mimic strchr, strstr and strcmp. For text parsing, we can
|
||||
// leverage the strchr functionality.
|
||||
//
|
||||
// The instruction operates on two sse registers:
|
||||
// - the needle (what you are searching for)
|
||||
// - the haystack (where you are searching in)
|
||||
// Both registers can contain up to 16 characters. The result is a 16-bit mask with a bit
|
||||
// set for each character in the haystack that matched any character in the needle.
|
||||
// For example:
|
||||
// Needle = 'abcd000000000000' (we're searching for any a's, b's, c's d's)
|
||||
// Haystack = 'asdfghjklhjbdwwc' (the raw string)
|
||||
// Result = '1010000000011001'
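// A standalone sketch of the strchr-style match just described, using the
// needle/haystack values from the example above. Assumptions: SSE4.2 hardware,
// compiled with -msse4.2; the explicit mode flags stand in for the
// SSEUtil::STRCHR_MODE constant used by the scanner.
#include <nmmintrin.h>
#include <cstdio>

int main() {
  const char needle_chars[16] = "abcd";                // zero padding ends the needle
  const char haystack_chars[17] = "asdfghjklhjbdwwc";  // 16 raw characters
  __m128i needle =
      _mm_loadu_si128(reinterpret_cast<const __m128i*>(needle_chars));
  __m128i haystack =
      _mm_loadu_si128(reinterpret_cast<const __m128i*>(haystack_chars));
  __m128i mask = _mm_cmpistrm(needle, haystack,
      _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_BIT_MASK);
  int bits = _mm_extract_epi16(mask, 0);
  // Bit n is set when haystack[n] matches any character of the needle.
  for (int n = 0; n < 16; ++n) printf("%d", (bits >> n) & 1);
  printf("\n");  // prints 1010000000011001
  return 0;
}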
|
||||
Status HdfsTextScanner::ParseFileBuffer(int max_tuples, int* num_tuples, int* num_fields,
|
||||
char** column_start) {
|
||||
COUNTER_SCOPED_TIMER(scan_node_->parse_time_counter());
|
||||
|
||||
// Start of this batch.
|
||||
*column_start = byte_buffer_ptr_;
|
||||
|
||||
// To parse using SSE, we:
|
||||
// 1. Load into different sse registers the different characters we need to search for
|
||||
// - tuple breaks, field breaks, escape characters
|
||||
// 2. Load 16 characters at a time into the sse register
|
||||
// 3. Use the SSE instruction to do strchr on those 16 chars, the result is a bitmask
|
||||
// 4. Compute the bitmask for tuple breaks, field breaks and escape characters.
|
||||
// 5. If there are escape characters, fix up the matching masked bits in the field/tuple mask
|
||||
// 6. Go through the mask bit by bit and write the parsed data.
|
||||
|
||||
// xmm registers:
|
||||
// - xmm_buffer: the register holding the current (16 chars) we're working on from the
|
||||
// - file
|
||||
// - xmm_tuple_search_: the tuple search register. Only contains the tuple_delim char.
|
||||
// - xmm_field_search_: the field search register. Contains field delim and
|
||||
// collection_item delim_char
|
||||
// - xmm_escape_search_: the escape search register. Only contains escape char
|
||||
// - xmm_tuple_mask: the result of doing strchr for the tuple delim
|
||||
// - xmm_field_mask: the result of doing strchr for the field delim
|
||||
// - xmm_escape_mask: the result of doing strchr for the escape char
|
||||
__m128i xmm_buffer, xmm_tuple_mask, xmm_field_mask, xmm_escape_mask;
|
||||
|
||||
// Length remaining of buffer to process
|
||||
int remaining_len = byte_buffer_end_ - byte_buffer_ptr_;
|
||||
|
||||
const vector<int>& column_idx_to_slot_idx_ = scan_node_->column_to_slot_index();
|
||||
|
||||
if (CpuInfo::Instance()->IsSupported(CpuInfo::SSE4_2)) {
|
||||
while (remaining_len >= SSEUtil::CHARS_PER_128_BIT_REGISTER) {
|
||||
// Load the next 16 bytes into the xmm register
|
||||
xmm_buffer = _mm_loadu_si128(reinterpret_cast<__m128i*>(byte_buffer_ptr_));
|
||||
|
||||
// Do the strchr for tuple and field breaks
|
||||
// TODO: can we parallelize this as well? Are there multiple sse execution units?
|
||||
xmm_tuple_mask = _mm_cmpistrm(xmm_tuple_search_, xmm_buffer, SSEUtil::STRCHR_MODE);
|
||||
xmm_field_mask = _mm_cmpistrm(xmm_field_search_, xmm_buffer, SSEUtil::STRCHR_MODE);
|
||||
|
||||
// The strchr sse instruction returns the result in the lower bits of the sse
|
||||
// register. Since we only process 16 characters at a time, only the lower 16 bits
|
||||
// can contain non-zero values.
|
||||
// _mm_extract_epi16 will extract 16 bits out of the xmm register. The second
|
||||
// parameter specifies which 16 bits to extract (0 for the lowest 16 bits).
|
||||
int tuple_mask = _mm_extract_epi16(xmm_tuple_mask, 0);
|
||||
int field_mask = _mm_extract_epi16(xmm_field_mask, 0);
|
||||
int escape_mask = 0;
|
||||
|
||||
// If the table does not use escape characters, skip processing for it.
|
||||
if (escape_char_ != '\0') {
|
||||
xmm_escape_mask = _mm_cmpistrm(xmm_escape_search_, xmm_buffer,
|
||||
SSEUtil::STRCHR_MODE);
|
||||
escape_mask = _mm_extract_epi16(xmm_escape_mask, 0);
|
||||
ProcessEscapeMask(escape_mask, &last_char_is_escape_, &field_mask, &tuple_mask);
|
||||
}
|
||||
|
||||
// Tuple delims are automatically field delims
|
||||
field_mask |= tuple_mask;
|
||||
|
||||
if (field_mask != 0) {
|
||||
// Loop through the mask and find the tuple/column offsets
|
||||
for (int n = 0; n < SSEUtil::CHARS_PER_128_BIT_REGISTER; ++n) {
|
||||
if (escape_mask != 0) {
|
||||
current_column_has_escape_ =
|
||||
current_column_has_escape_ || (escape_mask & SSEUtil::SSE_BITMASK[n]);
|
||||
}
|
||||
|
||||
if (field_mask & SSEUtil::SSE_BITMASK[n]) {
|
||||
char* column_end = byte_buffer_ptr_ + n;
|
||||
// TODO: apparently there can be columns not in the schema which should be
|
||||
// ignored. This does not handle that.
|
||||
if (column_idx_to_slot_idx_[column_idx_] != HdfsScanNode::SKIP_COLUMN) {
|
||||
DCHECK_LT(*num_fields, field_locations_.size());
|
||||
// Found a column that needs to be parsed, write the start/len to
|
||||
// 'parsed_data_'
|
||||
const int len = column_end - *column_start;
|
||||
field_locations_[*num_fields].start = *column_start;
|
||||
if (!current_column_has_escape_) {
|
||||
field_locations_[*num_fields].len = len;
|
||||
} else {
|
||||
field_locations_[*num_fields].len = -len;
|
||||
}
|
||||
if (!boundary_column_.Empty()) {
|
||||
CopyBoundaryField(&field_locations_[*num_fields]);
|
||||
}
|
||||
++(*num_fields);
|
||||
}
|
||||
current_column_has_escape_ = false;
|
||||
boundary_column_.Clear();
|
||||
*column_start = column_end + 1;
|
||||
++column_idx_;
|
||||
}
|
||||
|
||||
if (tuple_mask & SSEUtil::SSE_BITMASK[n]) {
|
||||
column_idx_ = scan_node_->GetNumPartitionKeys();
|
||||
++(*num_tuples);
|
||||
if (*num_tuples == max_tuples) {
|
||||
byte_buffer_ptr_ += (n + 1);
|
||||
last_char_is_escape_ = false;
|
||||
return Status::OK;
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
current_column_has_escape_ = (current_column_has_escape_ || escape_mask);
|
||||
}
|
||||
|
||||
remaining_len -= SSEUtil::CHARS_PER_128_BIT_REGISTER;
|
||||
byte_buffer_ptr_ += SSEUtil::CHARS_PER_128_BIT_REGISTER;
|
||||
}
|
||||
}
|
||||
|
||||
// Handle the remaining characters
|
||||
while (remaining_len > 0) {
|
||||
bool new_tuple = false;
|
||||
bool new_col = false;
|
||||
|
||||
if (!last_char_is_escape_) {
|
||||
if (*byte_buffer_ptr_ == tuple_delim_) {
|
||||
new_tuple = true;
|
||||
new_col = true;
|
||||
} else if (*byte_buffer_ptr_ == field_delim_
|
||||
|| *byte_buffer_ptr_ == collection_item_delim_) {
|
||||
new_col = true;
|
||||
}
|
||||
}
|
||||
if (*byte_buffer_ptr_ == escape_char_) {
|
||||
current_column_has_escape_ = true;
|
||||
last_char_is_escape_ = !last_char_is_escape_;
|
||||
} else {
|
||||
last_char_is_escape_ = false;
|
||||
}
|
||||
|
||||
if (new_col) {
|
||||
if (column_idx_to_slot_idx_[column_idx_] != HdfsScanNode::SKIP_COLUMN) {
|
||||
DCHECK_LT(*num_fields, field_locations_.size());
|
||||
// Found a column that needs to be parsed, write the start/len to 'parsed_data_'
|
||||
field_locations_[*num_fields].start = *column_start;
|
||||
field_locations_[*num_fields].len = byte_buffer_ptr_ - *column_start;
|
||||
if (current_column_has_escape_) field_locations_[*num_fields].len *= -1;
|
||||
if (!boundary_column_.Empty()) {
|
||||
CopyBoundaryField(&field_locations_[*num_fields]);
|
||||
}
|
||||
++(*num_fields);
|
||||
}
|
||||
boundary_column_.Clear();
|
||||
current_column_has_escape_ = false;
|
||||
*column_start = byte_buffer_ptr_ + 1;
|
||||
++column_idx_;
|
||||
}
|
||||
|
||||
if (new_tuple) {
|
||||
column_idx_ = scan_node_->GetNumPartitionKeys();
|
||||
++(*num_tuples);
|
||||
}
|
||||
|
||||
--remaining_len;
|
||||
++byte_buffer_ptr_;
|
||||
|
||||
if (*num_tuples == max_tuples) return Status::OK;
|
||||
}
|
||||
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
|
||||
#ifndef IMPALA_HDFS_TEXT_SCANNER_H_
|
||||
#define IMPALA_HDFS_TEXT_SCANNER_H_
|
||||
#ifndef IMPALA_EXEC_HDFS_TEXT_SCANNER_H
|
||||
#define IMPALA_EXEC_HDFS_TEXT_SCANNER_H
|
||||
|
||||
#include "exec/hdfs-scanner.h"
|
||||
#include "exec/delimited-text-parser.h"
|
||||
|
||||
namespace impala {
|
||||
|
||||
@@ -22,6 +23,40 @@ class HdfsTextScanner : public HdfsScanner {
|
||||
const static char DELIM_INIT = -1;
|
||||
const static int NEXT_BLOCK_READ_SIZE = 1024; //bytes
|
||||
|
||||
// Prepends field data that was from the previous file buffer (This field straddled two
|
||||
// file buffers). 'data' already contains the pointer/len from the current file buffer,
|
||||
// boundary_column_ contains the beginning of the data from the previous file
|
||||
// buffer. This function will allocate a new string from the tuple pool, concatenate the
|
||||
// two pieces and update 'data' to contain the new pointer/len.
|
||||
void CopyBoundaryField(DelimitedTextParser::FieldLocation* data);
|
||||
|
||||
// Initialises any state required at the beginning of a new scan
|
||||
// range. Here this means resetting escaping state.
|
||||
virtual Status InitCurrentScanRange(RuntimeState* state, HdfsScanRange* scan_range,
|
||||
ByteStream* byte_stream);
|
||||
|
||||
// Writes the intermediate parsed data in to slots, outputting
|
||||
// tuples to row_batch as they complete.
|
||||
// Input Parameters:
|
||||
// state: Runtime state into which we log errors
|
||||
// row_batch: Row batch into which to write new tuples
|
||||
// first_column_idx: The col idx for the raw file associated with parsed_data_[0]
|
||||
// num_fields: Total number of fields contained in parsed_data_
|
||||
// Input/Output Parameters
|
||||
// row_idx: Index of current row in row_batch.
|
||||
// line_start: pointer to within byte_buffer where the current line starts. This is
|
||||
// used for better error reporting
|
||||
Status WriteFields(RuntimeState* state, RowBatch* row_batch, int num_fields,
|
||||
int* row_idx, char** line_start);
|
||||
|
||||
// Appends the current file and line to the RuntimeState's error log (if there's space).
|
||||
// Also, increments num_errors_in_file_.
|
||||
void ReportRowParseError(RuntimeState* state, char* line_start, int len);
|
||||
|
||||
// Reads up to size bytes from byte_stream into byte_buffer_, and
|
||||
// updates byte_buffer_read_size_
|
||||
Status FillByteBuffer(RuntimeState* state, int64_t size);
|
||||
|
||||
// Memory pool for allocations into the boundary row / column
|
||||
boost::scoped_ptr<MemPool> boundary_mem_pool_;
|
||||
|
||||
@@ -33,12 +68,15 @@ class HdfsTextScanner : public HdfsScanner {
|
||||
// Helper string for dealing with columns that span file blocks.
|
||||
StringBuffer boundary_column_;
|
||||
|
||||
// Index to keep track of the current column in the current file.
|
||||
int column_idx_;
|
||||
|
||||
// Index into materialized_slots_ for the next slot to output for the current tuple.
|
||||
int slot_idx_;
|
||||
|
||||
// Helper class for picking fields and rows from delimited text.
|
||||
boost::scoped_ptr<DelimitedTextParser> delimited_text_parser_;
|
||||
|
||||
// Return field locations from the Delimited Text Parser.
|
||||
std::vector<DelimitedTextParser::FieldLocation> field_locations_;
|
||||
|
||||
// Helper class for converting text to other types.
|
||||
boost::scoped_ptr<TextConverter> text_converter_;
|
||||
|
||||
@@ -70,108 +108,10 @@ class HdfsTextScanner : public HdfsScanner {
|
||||
// logged.
|
||||
bool error_in_row_;
|
||||
|
||||
// Intermediate structure used for two pass parsing approach. In the first pass,
|
||||
// FieldLocations structs are filled out and contain where all the fields start and
|
||||
// their lengths. In the second pass, the FieldLocations is used to write out the
|
||||
// slots. We want to keep this struct as small as possible.
|
||||
struct FieldLocations {
|
||||
// Start of field.
|
||||
char* start;
|
||||
// Encodes the length and whether or not this field needs to be unescaped.
|
||||
// If len < 0, then the field needs to be unescaped.
|
||||
int len;
|
||||
};
|
||||
std::vector<FieldLocations> field_locations_;
|
||||
|
||||
// SSE(xmm) register containing the tuple search character.
|
||||
__m128i xmm_tuple_search_;
|
||||
|
||||
// SSE(xmm) register containing the field search character.
|
||||
__m128i xmm_field_search_;
|
||||
|
||||
// SSE(xmm) register containing the escape search character.
|
||||
__m128i xmm_escape_search_;
|
||||
|
||||
// Character delimiting tuples.
|
||||
char tuple_delim_;
|
||||
|
||||
// Character delimiting fields (to become slots).
|
||||
char field_delim_;
|
||||
|
||||
// Character delimiting collection items (to become slots).
|
||||
char collection_item_delim_;
|
||||
|
||||
// Escape character.
|
||||
char escape_char_;
|
||||
|
||||
// Whether or not the previous character was the escape character
|
||||
bool last_char_is_escape_;
|
||||
|
||||
// Whether or not the current column has an escape character in it
|
||||
// (and needs to be unescaped)
|
||||
bool current_column_has_escape_;
|
||||
|
||||
// Tracks the number of bytes left to read in the current scan
|
||||
// range. When <= 0, GetNext will prepare to exit.
|
||||
int current_range_remaining_len_;
|
||||
|
||||
// Prepends field data that was from the previous file buffer (This field straddled two
|
||||
// file buffers). 'data' already contains the pointer/len from the current file buffer,
|
||||
// boundary_column_ contains the beginning of the data from the previous file
|
||||
// buffer. This function will allocate a new string from the tuple pool, concatenate the
|
||||
// two pieces and update 'data' to contain the new pointer/len.
|
||||
void CopyBoundaryField(FieldLocations* data);
|
||||
|
||||
// Parses the current file_buffer_ for the field and tuple breaks.
|
||||
// This function will write the field start & len to 'parsed_data_'
|
||||
// which can then be written out to tuples.
|
||||
// This function will use SSE (Intel's Streaming SIMD Extensions to the x86
// instruction set) if the hardware supports SSE4.2
|
||||
// instructions. SSE4.2 added string processing instructions that
|
||||
// allow for processing 16 characters at a time. Otherwise, this
|
||||
// function will walk the file_buffer_ character by character.
|
||||
// Input Parameters:
|
||||
// max_tuples: The maximum number of tuples that should be parsed.
|
||||
// This is used to control how the batching works.
|
||||
// Output Parameters:
|
||||
// num_tuples: Number of tuples parsed
|
||||
// num_fields: Number of materialized fields parsed
|
||||
// col_start: pointer within file_buffer_ where the next field starts
|
||||
Status ParseFileBuffer(int max_tuples, int* num_tuples, int* num_fields,
|
||||
char** column_start);
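
For illustration, the SSE4.2 path can be sketched with _mm_cmpistri, which reports the position of the first byte in a 16-byte chunk that matches any byte of a small needle. This is only a sketch of the single-delimiter case: it ignores field delimiters and escapes, assumes the buffer contains no NUL bytes (the implicit-length string compare stops at NUL), and must be compiled with SSE4.2 enabled (e.g. -msse4.2).

#include <nmmintrin.h>  // SSE4.2 string instructions

// Returns the offset of the first 'delim' in buffer[0..len), or len if absent.
// Falls back to a scalar loop for the tail shorter than 16 bytes.
int FindDelimiterSse42(const char* buffer, int len, char delim) {
  // Needle register: 'delim' in byte 0, zeros above -> implicit length 1.
  const __m128i needle = _mm_cvtsi32_si128(static_cast<unsigned char>(delim));
  int offset = 0;
  while (len - offset >= 16) {
    __m128i chunk = _mm_loadu_si128(
        reinterpret_cast<const __m128i*>(buffer + offset));
    // Index of the first byte in 'chunk' equal to any needle byte, or 16.
    int idx = _mm_cmpistri(needle, chunk,
        _SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_ANY | _SIDD_LEAST_SIGNIFICANT);
    if (idx < 16) return offset + idx;
    offset += 16;
  }
  for (; offset < len; ++offset) {
    if (buffer[offset] == delim) return offset;
  }
  return len;
}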
|
||||
|
||||
// Initialises any state required at the beginning of a new scan
|
||||
// range. Here this means resetting escaping state.
|
||||
virtual Status InitCurrentScanRange(RuntimeState* state, HdfsScanRange* scan_range,
|
||||
ByteStream* byte_stream);
|
||||
|
||||
// Searches for the offset of the first full tuple in the supplied buffer.
|
||||
int FindFirstTupleStart(char* buffer, int len);
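
One simple way to implement this search, sketched below with the delimiter and escape character passed in explicitly, is to return the offset just past the first unescaped tuple delimiter; the bytes before it belong to a tuple that started in the previous scan range.

// Returns the offset of the first byte after the first unescaped tuple
// delimiter in buffer[0..len), or len if no complete tuple boundary is found.
int FindFirstTupleStartSketch(const char* buffer, int len,
                              char tuple_delim, char escape_char) {
  bool last_char_was_escape = false;
  for (int i = 0; i < len; ++i) {
    char c = buffer[i];
    if (c == tuple_delim && !last_char_was_escape) return i + 1;
    last_char_was_escape = (c == escape_char) && !last_char_was_escape;
  }
  return len;
}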
|
||||
|
||||
// Writes the intermediate parsed data in to slots, outputting
|
||||
// tuples to row_batch as they complete.
|
||||
// Input Parameters:
|
||||
// state: Runtime state into which we log errors
|
||||
// row_batch: Row batch into which to write new tuples
|
||||
// first_column_idx: The col idx for the raw file associated with parsed_data_[0]
|
||||
// num_fields: Total number of fields contained in parsed_data_
|
||||
// Input/Output Parameters
|
||||
// row_idx: Index of current row in row_batch.
|
||||
// line_start: pointer to within byte_buffer where the current line starts. This is
|
||||
// used for better error reporting
|
||||
Status WriteFields(RuntimeState* state, RowBatch* row_batch, int num_fields,
|
||||
int* row_idx, char** line_start);
|
||||
|
||||
Status WriteSlots(RuntimeState* state, int tuple_idx, int* next_line_offset);
|
||||
|
||||
// Appends the current file and line to the RuntimeState's error log (if there's space).
|
||||
// Also, increments num_errors_in_file_.
|
||||
void ReportRowParseError(RuntimeState* state, char* line_start, int len);
|
||||
|
||||
// Reads up to size bytes from byte_stream into byte_buffer_, and
|
||||
// updates byte_buffer_read_size_
|
||||
Status FillByteBuffer(RuntimeState* state, int64_t size);
|
||||
|
||||
};
|
||||
|
||||
}
|
||||
|
||||
@@ -23,16 +23,10 @@ Status SerDeUtils::ReadBoolean(ByteStream* byte_stream, bool* boolean) {
|
||||
}
|
||||
|
||||
Status SerDeUtils::ReadInt(ByteStream* byte_stream, int32_t* integer) {
|
||||
uint8_t buf[sizeof(int)];
|
||||
char buf[sizeof(int32_t)];
|
||||
RETURN_IF_ERROR(SerDeUtils::ReadBytes(byte_stream, sizeof(int32_t),
|
||||
reinterpret_cast<char*>(&buf)));
|
||||
|
||||
*integer =
|
||||
((buf[0] & 0xff) << 24)
|
||||
| ((buf[1] & 0xff) << 16)
|
||||
| ((buf[2] & 0xff) << 8)
|
||||
| (buf[3] & 0xff);
|
||||
return Status::OK;
|
||||
return SerDeUtils::ReadInt(buf, integer);
|
||||
}
|
||||
|
||||
Status SerDeUtils::ReadVLong(ByteStream* byte_stream, int64_t* vlong) {
|
||||
@@ -145,6 +139,13 @@ Status SerDeUtils::ReadText(ByteStream* byte_stream, std::vector<char>* text) {
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
Status SerDeUtils::SkipText(ByteStream* byte_stream) {
|
||||
int32_t length;
|
||||
RETURN_IF_ERROR(ReadVInt(byte_stream, &length));
|
||||
RETURN_IF_ERROR(SkipBytes(byte_stream, length));
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
std::string SerDeUtils::HexDump(const char* buf, int64_t length) {
|
||||
std::stringstream ss;
|
||||
ss << std::hex;
|
||||
|
||||
@@ -33,6 +33,18 @@ public:
|
||||
// Equivalent to java.io.DataInput.readInt()
|
||||
static Status ReadInt(ByteStream* byte_stream, int32_t* integer);
|
||||
|
||||
// Read a big-endian integer from a buffer.
|
||||
static Status ReadInt(char* in_buf, int32_t* integer) {
|
||||
// TODO: all buffers should be typed to uint8_t*
|
||||
uint8_t* buf = reinterpret_cast<uint8_t*>(in_buf);
|
||||
*integer =
|
||||
(buf[0] << 24)
|
||||
| (buf[1] << 16)
|
||||
| (buf[2] << 8)
|
||||
| buf[3];
|
||||
return Status::OK;
|
||||
}
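
As a quick sanity check of the byte order, the following standalone equivalent decodes {0x00, 0x00, 0x01, 0x02} to 258 (1 * 256 + 2), matching java.io.DataInput.readInt(), which reads the most significant byte first.

#include <cassert>
#include <cstdint>

// Standalone big-endian decode, equivalent in effect to ReadInt(char*, int32_t*).
// Accumulating in uint32_t avoids shifting into the sign bit for bytes >= 0x80.
int32_t DecodeBigEndianInt32(const unsigned char* buf) {
  uint32_t v = (static_cast<uint32_t>(buf[0]) << 24)
             | (static_cast<uint32_t>(buf[1]) << 16)
             | (static_cast<uint32_t>(buf[2]) << 8)
             |  static_cast<uint32_t>(buf[3]);
  return static_cast<int32_t>(v);
}

int main() {
  const unsigned char buf[4] = {0x00, 0x00, 0x01, 0x02};
  assert(DecodeBigEndianInt32(buf) == 258);  // 1 * 256 + 2
  return 0;
}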
|
||||
|
||||
// Read a variable-length Long value written using Writable serialization.
|
||||
// Ref: org.apache.hadoop.io.WritableUtils.readVLong()
|
||||
static Status ReadVLong(ByteStream* byte_stream, int64_t* vlong);
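
For reference, the Writable vlong encoding stores values in [-112, 127] in a single byte and everything else as a sign/length prefix byte followed by one to eight big-endian magnitude bytes. The sketch below is my reading of WritableUtils.readVLong(), decoding from an in-memory buffer rather than a ByteStream, and is not Impala's implementation.

#include <cstdint>

// Decodes a Hadoop Writable vlong from 'buf', writing the value to '*value'
// and returning the number of bytes consumed (1 to 9).
int DecodeWritableVLong(const int8_t* buf, int64_t* value) {
  int8_t first = buf[0];
  // Single-byte case: values in [-112, 127] are stored as-is.
  if (first >= -112) {
    *value = first;
    return 1;
  }
  // Prefix byte encodes the sign and how many magnitude bytes follow.
  bool negative = first < -120;
  int num_bytes = negative ? (-119 - first) - 1 : (-111 - first) - 1;
  uint64_t magnitude = 0;
  for (int i = 0; i < num_bytes; ++i) {
    magnitude = (magnitude << 8) | static_cast<uint8_t>(buf[1 + i]);
  }
  // Negative values are stored as the bitwise complement of the magnitude.
  *value = negative ? ~static_cast<int64_t>(magnitude)
                    : static_cast<int64_t>(magnitude);
  return 1 + num_bytes;
}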
|
||||
@@ -62,6 +74,9 @@ public:
|
||||
// Ref: org.apache.hadoop.io.WritableUtils.readString()
|
||||
static Status ReadText(ByteStream* byte_stream, std::vector<char>* text);
|
||||
|
||||
// Skip this text object.
|
||||
static Status SkipText(ByteStream* byte_stream);
|
||||
|
||||
// Dump the first length bytes of buf to a Hex string.
|
||||
static std::string HexDump(const char* buf, int64_t length);
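
A minimal version of such a dump, not necessarily matching Impala's exact output format, looks like the following; the cast through unsigned char is the detail that is easy to get wrong.

#include <cstdint>
#include <iomanip>
#include <sstream>
#include <string>

// Renders the first 'length' bytes of 'buf' as space-separated hex pairs.
std::string HexDumpSketch(const char* buf, int64_t length) {
  std::stringstream ss;
  ss << std::hex << std::setfill('0');
  for (int64_t i = 0; i < length; ++i) {
    if (i > 0) ss << ' ';
    // Cast through unsigned char so bytes >= 0x80 are not sign-extended.
    ss << std::setw(2)
       << static_cast<int>(static_cast<unsigned char>(buf[i]));
  }
  return ss.str();
}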
|
||||
|
||||
|
||||
@@ -1,10 +1,12 @@
|
||||
// Copyright (c) 2011 Cloudera, Inc. All rights reserved.
|
||||
|
||||
#include "runtime/runtime-state.h"
|
||||
#include "text-converter.h"
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <glog/logging.h>
|
||||
#include "runtime/descriptors.h"
|
||||
#include "runtime/tuple.h"
|
||||
#include "util/string-parser.h"
|
||||
#include "runtime/string-value.h"
|
||||
#include "runtime/timestamp-value.h"
|
||||
#include "runtime/mem-pool.h"
|
||||
@@ -18,103 +20,6 @@ TextConverter::TextConverter(char escape_char, MemPool* var_len_pool)
|
||||
var_len_pool_(var_len_pool) {
|
||||
}
|
||||
|
||||
bool TextConverter::ConvertAndWriteSlotBytes(const char* begin, const char* end, Tuple* tuple,
|
||||
const SlotDescriptor* slot_desc, bool copy_string, bool unescape_string) {
|
||||
// Check for null columns.
|
||||
// The below code implies that unquoted empty strings
|
||||
// such as "...,,..." become NULLs, and not empty strings.
|
||||
if (begin == end) {
|
||||
tuple->SetNull(slot_desc->null_indicator_offset());
|
||||
return true;
|
||||
}
|
||||
// Will be changed in conversion functions for error checking.
|
||||
char* end_ptr = const_cast<char*>(end);
|
||||
// TODO: Handle out-of-range conditions.
|
||||
switch (slot_desc->type()) {
|
||||
case TYPE_BOOLEAN: {
|
||||
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
|
||||
if (iequals(begin, "true")) {
|
||||
*reinterpret_cast<char*>(slot) = true;
|
||||
} else if (iequals(begin, "false")) {
|
||||
*reinterpret_cast<char*>(slot) = false;
|
||||
} else {
|
||||
// Inconvertible value. Set to NULL after switch statement.
|
||||
end_ptr = const_cast<char*>(begin);
|
||||
}
|
||||
break;
|
||||
}
|
||||
case TYPE_TINYINT: {
|
||||
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
|
||||
*reinterpret_cast<int8_t*>(slot) =
|
||||
static_cast<int8_t>(strtol(begin, &end_ptr, 0));
|
||||
break;
|
||||
}
|
||||
case TYPE_SMALLINT: {
|
||||
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
|
||||
*reinterpret_cast<int16_t*>(slot) =
|
||||
static_cast<int16_t>(strtol(begin, &end_ptr, 0));
|
||||
break;
|
||||
}
|
||||
case TYPE_INT: {
|
||||
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
|
||||
*reinterpret_cast<int32_t*>(slot) =
|
||||
static_cast<int32_t>(strtol(begin, &end_ptr, 0));
|
||||
break;
|
||||
}
|
||||
case TYPE_BIGINT: {
|
||||
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
|
||||
*reinterpret_cast<int64_t*>(slot) = strtol(begin, &end_ptr, 0);
|
||||
break;
|
||||
}
|
||||
case TYPE_FLOAT: {
|
||||
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
|
||||
*reinterpret_cast<float*>(slot) =
|
||||
static_cast<float>(strtod(begin, &end_ptr));
|
||||
break;
|
||||
}
|
||||
case TYPE_DOUBLE: {
|
||||
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
|
||||
*reinterpret_cast<double*>(slot) = strtod(begin, &end_ptr);
|
||||
break;
|
||||
}
|
||||
case TYPE_STRING: {
|
||||
StringValue* slot = tuple->GetStringSlot(slot_desc->tuple_offset());
|
||||
const char* data_start = NULL;
|
||||
slot->len = end - begin;
|
||||
data_start = begin;
|
||||
|
||||
if (!copy_string) {
|
||||
DCHECK(!unescape_string);
|
||||
slot->ptr = const_cast<char*>(data_start);
|
||||
} else {
|
||||
char* slot_data = reinterpret_cast<char*>(var_len_pool_->Allocate(slot->len));
|
||||
if (unescape_string) {
|
||||
UnescapeString(data_start, slot_data, &slot->len);
|
||||
} else {
|
||||
memcpy(slot_data, data_start, slot->len);
|
||||
}
|
||||
slot->ptr = slot_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case TYPE_TIMESTAMP : {
|
||||
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
|
||||
string strbuf(begin, end - begin);
|
||||
*reinterpret_cast<TimestampValue*>(slot) = TimestampValue(strbuf);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
DCHECK(false) << "bad slot type: " << TypeToString(slot_desc->type());
|
||||
}
|
||||
// Set NULL if inconvertible.
|
||||
if (*end_ptr != '\0' && slot_desc->type() != TYPE_STRING) {
|
||||
tuple->SetNull(slot_desc->null_indicator_offset());
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void TextConverter::UnescapeString(StringValue* value) {
|
||||
char* new_data = reinterpret_cast<char*>(var_len_pool_->Allocate(value->len));
|
||||
UnescapeString(value->ptr, new_data, &value->len);
|
||||
|
||||
@@ -3,29 +3,32 @@
|
||||
#ifndef IMPALA_EXEC_TEXT_CONVERTER_H
|
||||
#define IMPALA_EXEC_TEXT_CONVERTER_H
|
||||
|
||||
#include "runtime/runtime-state.h"
|
||||
namespace impala {
|
||||
|
||||
class Tuple;
|
||||
class SlotDescriptor;
|
||||
class MemPool;
|
||||
class StringValue;
|
||||
class Status;
|
||||
|
||||
// Helper class for dealing with text data, e.g., converting text data to numeric types, etc.
|
||||
// Helper class for dealing with text data, e.g., converting text data to
|
||||
// numeric types, etc.
|
||||
class TextConverter {
|
||||
public:
|
||||
TextConverter(char escape_char, MemPool* var_len_pool);
|
||||
|
||||
// Converts slot data (begin, end) into type of slot_desc,
|
||||
// Converts slot data, of length 'len', into type of slot_desc,
|
||||
// and writes the result into the tuples's slot.
|
||||
// copy_string indicates whether we need to make a separate copy of the string data:
|
||||
// For regular unescaped strings, we point to the original data in the file_buf_.
|
||||
// For regular escaped strings,
|
||||
// we copy its unescaped contents into a separate buffer and point to it.
|
||||
// Unsuccessful conversions are turned into NULLs.
|
||||
// Returns true if value was converted and written successfully, false otherwise.
|
||||
bool ConvertAndWriteSlotBytes(const char* begin,
|
||||
const char* end, Tuple* tuple, const SlotDescriptor* slot_desc,
|
||||
bool copy_string, bool unescape_string);
|
||||
// Returns Status::OK if the value was written successfully, error otherwise
|
||||
Status WriteSlot(RuntimeState* state, const SlotDescriptor* slot_desc,
|
||||
Tuple* tuple, const char* data, int len,
|
||||
bool copy_string, bool need_escape);
|
||||
|
||||
// Removes escape characters from len characters of the null-terminated string src,
|
||||
// and copies the unescaped string into dest, changing *len to the unescaped length.
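
A straightforward single-pass implementation of this unescaping, sketched with the escape character passed in explicitly and assuming dest has at least *len bytes of room:

// Copies '*len' bytes from 'src' to 'dest', dropping each escape character
// and keeping the byte it protects; updates '*len' to the unescaped length.
void UnescapeStringSketch(const char* src, char* dest, int* len,
                          char escape_char) {
  int out = 0;
  for (int i = 0; i < *len; ++i) {
    if (src[i] == escape_char && i + 1 < *len) {
      ++i;  // Skip the escape character and emit the escaped byte verbatim.
    }
    dest[out++] = src[i];
  }
  *len = out;
}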
|
||||
|
||||
104
be/src/exec/text-converter.inline.h
Normal file
@@ -0,0 +1,104 @@
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
|
||||
#include "runtime/runtime-state.h"
|
||||
#include "text-converter.h"
|
||||
#include <boost/algorithm/string.hpp>
|
||||
#include <glog/logging.h>
|
||||
#include "runtime/descriptors.h"
|
||||
#include "runtime/tuple.h"
|
||||
#include "util/string-parser.h"
|
||||
#include "runtime/string-value.h"
|
||||
#include "runtime/timestamp-value.h"
|
||||
#include "runtime/mem-pool.h"
|
||||
|
||||
using namespace boost;
|
||||
using namespace impala;
|
||||
using namespace std;
|
||||
|
||||
// TODO: This needs to be code-generated rather than inlined.
|
||||
inline Status TextConverter::WriteSlot(RuntimeState* state,
|
||||
const SlotDescriptor* slot_desc, Tuple* tuple,
|
||||
const char* data, int len,
|
||||
bool copy_string, bool need_escape) {
|
||||
|
||||
bool fail = false;
|
||||
|
||||
if (len == 0) {
|
||||
tuple->SetNull(slot_desc->null_indicator_offset());
|
||||
} else {
|
||||
StringParser::ParseResult parse_result = StringParser::PARSE_SUCCESS;
|
||||
void* slot = tuple->GetSlot(slot_desc->tuple_offset());
|
||||
|
||||
// Parse the raw-text data. Translate the text string to internal format.
|
||||
switch (slot_desc->type()) {
|
||||
case TYPE_STRING: {
|
||||
StringValue* str_slot = reinterpret_cast<StringValue*>(slot);
|
||||
str_slot->ptr = const_cast<char*>(data);
|
||||
str_slot->len = len;
|
||||
if (copy_string || need_escape) {
|
||||
char* slot_data = reinterpret_cast<char*>(var_len_pool_->Allocate(len));
|
||||
if (need_escape) {
|
||||
UnescapeString(data, slot_data, &str_slot->len);
|
||||
} else {
|
||||
memcpy(slot_data, data, str_slot->len);
|
||||
}
|
||||
str_slot->ptr = slot_data;
|
||||
}
|
||||
break;
|
||||
}
|
||||
case TYPE_BOOLEAN:
|
||||
*reinterpret_cast<bool*>(slot) =
|
||||
StringParser::StringToBool(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_TINYINT:
|
||||
*reinterpret_cast<int8_t*>(slot) =
|
||||
StringParser::StringToInt<int8_t>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_SMALLINT:
|
||||
*reinterpret_cast<int16_t*>(slot) =
|
||||
StringParser::StringToInt<int16_t>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_INT:
|
||||
*reinterpret_cast<int32_t*>(slot) =
|
||||
StringParser::StringToInt<int32_t>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_BIGINT:
|
||||
*reinterpret_cast<int64_t*>(slot) =
|
||||
StringParser::StringToInt<int64_t>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_FLOAT:
|
||||
*reinterpret_cast<float*>(slot) =
|
||||
StringParser::StringToFloat<float>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_DOUBLE:
|
||||
*reinterpret_cast<double*>(slot) =
|
||||
StringParser::StringToFloat<double>(data, len, &parse_result);
|
||||
break;
|
||||
case TYPE_TIMESTAMP: {
|
||||
string strbuf(data, len);
|
||||
*reinterpret_cast<TimestampValue*>(slot) = TimestampValue(strbuf);
|
||||
break;
|
||||
}
|
||||
default:
|
||||
DCHECK(false) << "bad slot type: " << TypeToString(slot_desc->type());
|
||||
break;
|
||||
}
|
||||
|
||||
// TODO: add warning for overflow case
|
||||
if (parse_result == StringParser::PARSE_FAILURE) {
|
||||
fail = true;
|
||||
tuple->SetNull(slot_desc->null_indicator_offset());
|
||||
if (state->LogHasSpace()) {
|
||||
state->error_stream()
|
||||
<< "Error converting column: " << slot_desc->col_pos() << " TO "
|
||||
// TODO: num_partition_keys_ no longer visible to scanner.
|
||||
// << slot_desc->col_pos() - num_partition_keys_ << " TO "
|
||||
<< TypeToString(slot_desc->type()) << "Data is: " << string(data,len) << endl;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (fail) return Status("Conversion from text failed");
|
||||
return Status::OK;
|
||||
}
|
||||
|
||||
@@ -64,6 +64,7 @@ target_link_libraries(expr-test
|
||||
gtest
|
||||
${Boost_LIBRARIES}
|
||||
${LLVM_MODULE_LIBS}
|
||||
-lz -lbz2 -lsnappy
|
||||
)
|
||||
|
||||
add_test(expr-test ${BUILD_OUTPUT_ROOT_DIRECTORY}/exprs/expr-test)
|
||||
|
||||
@@ -32,6 +32,7 @@ target_link_libraries(Runtime
|
||||
Exec
|
||||
TestUtil
|
||||
${Boost_LIBRARIES}
|
||||
-lz -lbz2 -lsnappy
|
||||
)
|
||||
|
||||
add_executable(mem-pool-test
|
||||
@@ -98,6 +99,7 @@ target_link_libraries(data-stream-test
|
||||
gtest
|
||||
${Boost_LIBRARIES}
|
||||
${LLVM_MODULE_LIBS}
|
||||
-lz -lbz2 -lsnappy
|
||||
)
|
||||
|
||||
add_test(mem-pool-test ${BUILD_OUTPUT_ROOT_DIRECTORY}/runtime/mem-pool-test)
|
||||
|
||||
@@ -231,6 +231,7 @@ Status DescriptorTbl::Create(ObjectPool* pool, const TDescriptorTable& thrift_tb
|
||||
switch (tdesc.tableType) {
|
||||
case TTableType::HDFS_TEXT_TABLE:
|
||||
case TTableType::HDFS_RCFILE_TABLE:
|
||||
case TTableType::HDFS_SEQFILE_TABLE:
|
||||
desc = pool->Add(new HdfsTableDescriptor(tdesc));
|
||||
break;
|
||||
case TTableType::HBASE_TABLE:
|
||||
|
||||
@@ -46,6 +46,7 @@ target_link_libraries(backend
|
||||
gflagsstatic
|
||||
# tcmallocstatic
|
||||
pprofstatic
|
||||
-lz -lbz2 -lsnappy
|
||||
)
|
||||
|
||||
add_executable(runquery
|
||||
@@ -84,6 +85,7 @@ target_link_libraries(runquery
|
||||
gflagsstatic
|
||||
tcmallocstatic
|
||||
pprofstatic
|
||||
-lz
|
||||
)
|
||||
|
||||
add_executable(impalad
|
||||
|
||||
@@ -19,6 +19,7 @@ target_link_libraries(TestUtil
|
||||
ImpalaThrift
|
||||
glogstatic
|
||||
gflagsstatic
|
||||
-lz -lbz2 -lsnappy
|
||||
)
|
||||
|
||||
add_executable(query-jitter
|
||||
|
||||
@@ -13,7 +13,7 @@ make -j
|
||||
cd $IMPALA_HOME
|
||||
|
||||
# Run sample queries - outputs .gcda files
|
||||
be/build/release/service/runquery -query="select count(field) from grep1gb where field like '%xyz%';select sourceIP, SUM(adRevenue) FROM uservisits GROUP by sourceIP order by SUM(adRevenue) desc limit 10;select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue from uservisits uv join rankings r on (r.pageurl = uv.desturl) where uv.visitdate > '1999-01-01' and uv.visitdate < '2000-01-01' group by uv.sourceip order by totalrevenue desc limit 1" -profile_output_file=""
|
||||
be/build/release/service/runquery -query="select count(field) from grep1gb where field like '%xyz%';select count(field) from grep1gb_seq_snap where field like '%xyz%';select sourceIP, SUM(adRevenue) FROM uservisits_seq GROUP by sourceIP order by SUM(adRevenue) desc limit 10;select sourceIP, SUM(adRevenue) FROM uservisits GROUP by sourceIP order by SUM(adRevenue) desc limit 10;select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue from uservisits uv join rankings r on (r.pageurl = uv.desturl) where uv.visitdate > '1999-01-01' and uv.visitdate < '2000-01-01' group by uv.sourceip order by totalrevenue desc limit 1" -profile_output_file=""
|
||||
|
||||
# Build again using the PGO data
|
||||
cmake -DCMAKE_BUILD_TYPE=PROFILE_BUILD .
|
||||
|
||||
@@ -214,10 +214,17 @@ queries = [
|
||||
["select count(*) from grep1gb", 5, 5],
|
||||
["select count(field) from grep1gb", 0, 5],
|
||||
["select count(field) from grep1gb where field like '%xyz%'", 0, 5],
|
||||
["select count(*) from grep1gb_seq_snap", 5, 5],
|
||||
["select count(field) from grep1gb_seq_snap", 0, 5],
|
||||
["select count(field) from grep1gb_seq_snap where field like '%xyz%'", 0, 5],
|
||||
["select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue "\
|
||||
"from uservisits uv join rankings r on (r.pageurl = uv.desturl) "\
|
||||
"where uv.visitdate > '1999-01-01' and uv.visitdate < '2000-01-01' "\
|
||||
"group by uv.sourceip order by totalrevenue desc limit 1", 5, 5],
|
||||
["select uv.sourceip, avg(r.pagerank), sum(uv.adrevenue) as totalrevenue "\
|
||||
"from uservisits_seq uv join rankings r on (r.pageurl = uv.desturl) "\
|
||||
"where uv.visitdate > '1999-01-01' and uv.visitdate < '2000-01-01' "\
|
||||
"group by uv.sourceip order by totalrevenue desc limit 1", 5, 5],
|
||||
["select sourceIP, SUM(adRevenue) FROM uservisits GROUP by sourceIP "\
|
||||
"order by SUM(adRevenue) desc limit 10", 5, 5],
|
||||
["select pageRank, pageURL from rankings where pageRank > 10 "\
|
||||
|
||||
@@ -19,6 +19,7 @@ struct TSlotDescriptor {
|
||||
enum TTableType {
|
||||
HDFS_TEXT_TABLE,
|
||||
HDFS_RCFILE_TABLE,
|
||||
HDFS_SEQFILE_TABLE,
|
||||
HBASE_TABLE
|
||||
}
|
||||
|
||||
|
||||
@@ -9,6 +9,7 @@ include "Types.thrift"
|
||||
enum TPlanNodeType {
|
||||
HDFS_TEXT_SCAN_NODE,
|
||||
HDFS_RCFILE_SCAN_NODE,
|
||||
HDFS_SEQFILE_SCAN_NODE,
|
||||
HBASE_SCAN_NODE,
|
||||
HASH_JOIN_NODE,
|
||||
AGGREGATION_NODE,
|
||||
|
||||
@@ -0,0 +1,41 @@
|
||||
// Copyright (c) 2011 Cloudera, Inc. All rights reserved.
|
||||
package com.cloudera.impala.catalog;
|
||||
|
||||
import java.util.List;
|
||||
|
||||
import com.cloudera.impala.analysis.Expr;
|
||||
import com.cloudera.impala.planner.DataSink;
|
||||
import com.cloudera.impala.thrift.TTableDescriptor;
|
||||
import com.cloudera.impala.thrift.TTableType;
|
||||
|
||||
/**
|
||||
* Sequence Table.
|
||||
*
|
||||
*/
|
||||
public class HdfsSeqFileTable extends HdfsTable {
|
||||
|
||||
// Input format class for Sequence tables read by Hive.
|
||||
private static final String sequenceFileInputFormat =
|
||||
"org.apache.hadoop.mapred.SequenceFileInputFormat";
|
||||
|
||||
protected HdfsSeqFileTable(TableId id, Db db, String name, String owner) {
|
||||
super(id, db, name, owner);
|
||||
}
|
||||
|
||||
@Override
|
||||
public TTableDescriptor toThrift() {
|
||||
TTableDescriptor tTable = super.toThrift();
|
||||
tTable.setTableType(TTableType.HDFS_SEQFILE_TABLE);
|
||||
return tTable;
|
||||
}
|
||||
|
||||
public static boolean isSeqFileTable(org.apache.hadoop.hive.metastore.api.Table msTbl) {
|
||||
return msTbl.getSd().getInputFormat().equals(sequenceFileInputFormat);
|
||||
}
|
||||
|
||||
@Override
|
||||
public DataSink createDataSink(List<Expr> partitionKeyExprs, boolean overwrite) {
|
||||
throw new UnsupportedOperationException("HdfsSeqFile Output Sink not implemented.");
|
||||
}
|
||||
}
|
||||
|
||||
@@ -25,9 +25,6 @@ import com.google.common.collect.Maps;
|
||||
* for the clustering columns, those two rows are most likely colocated. Note that this
|
||||
* is more general than Hive's CLUSTER BY ... INTO BUCKETS clause (which partitions
|
||||
* a key range into a fixed number of buckets).
|
||||
*
|
||||
* Current subclasses are HdfsTextTable, HdfsRCFileTable, and HBaseTable.
|
||||
*
|
||||
*/
|
||||
public abstract class Table {
|
||||
protected final TableId id;
|
||||
@@ -81,7 +78,7 @@ public abstract class Table {
|
||||
* @param db
|
||||
* @param tblName
|
||||
* @return
|
||||
* new instance of Hdfs[Text|RCFile]Table or HBaseTable
|
||||
* new instance of Hdfs[Text|RCFile|Seq]Table or HBaseTable
|
||||
* null if loading table failed
|
||||
*/
|
||||
public static Table load(TableId id, HiveMetaStoreClient client, Db db,
|
||||
@@ -98,6 +95,8 @@ public abstract class Table {
|
||||
table = new HdfsTextTable(id, db, tblName, msTbl.getOwner());
|
||||
} else if (HdfsRCFileTable.isRCFileTable(msTbl)) {
|
||||
table = new HdfsRCFileTable(id, db, tblName, msTbl.getOwner());
|
||||
} else if (HdfsSeqFileTable.isSeqFileTable(msTbl)) {
|
||||
table = new HdfsSeqFileTable(id, db, tblName, msTbl.getOwner());
|
||||
} else {
|
||||
throw new UnsupportedOperationException("Unrecognized table type");
|
||||
}
|
||||
|
||||
@@ -0,0 +1,24 @@
|
||||
// Copyright (c) 2012 Cloudera, Inc. All rights reserved.
|
||||
package com.cloudera.impala.planner;
|
||||
|
||||
import com.cloudera.impala.analysis.TupleDescriptor;
|
||||
import com.cloudera.impala.catalog.HdfsTable;
|
||||
import com.cloudera.impala.thrift.TPlanNode;
|
||||
import com.cloudera.impala.thrift.TPlanNodeType;
|
||||
|
||||
/**
|
||||
* HdfsSeqFileScanNode.
|
||||
*
|
||||
*/
|
||||
public class HdfsSeqFileScanNode extends HdfsScanNode {
|
||||
|
||||
public HdfsSeqFileScanNode(int id, TupleDescriptor desc, HdfsTable tbl) {
|
||||
super(id, desc, tbl);
|
||||
}
|
||||
|
||||
@Override
|
||||
protected void toThrift(TPlanNode msg) {
|
||||
super.toThrift(msg);
|
||||
msg.node_type = TPlanNodeType.HDFS_SEQFILE_SCAN_NODE;
|
||||
}
|
||||
}
|
||||
@@ -28,6 +28,7 @@ import com.cloudera.impala.analysis.TableRef;
|
||||
import com.cloudera.impala.analysis.TupleDescriptor;
|
||||
import com.cloudera.impala.analysis.TupleId;
|
||||
import com.cloudera.impala.catalog.HdfsRCFileTable;
|
||||
import com.cloudera.impala.catalog.HdfsSeqFileTable;
|
||||
import com.cloudera.impala.catalog.HdfsTextTable;
|
||||
import com.cloudera.impala.catalog.PrimitiveType;
|
||||
import com.cloudera.impala.common.InternalException;
|
||||
@@ -184,6 +185,10 @@ public class Planner {
|
||||
// Hive RCFile table
|
||||
scanNode = new HdfsRCFileScanNode(
|
||||
getNextNodeId(), tblRef.getDesc(), (HdfsRCFileTable) tblRef.getTable());
|
||||
} else if (tblRef.getTable() instanceof HdfsSeqFileTable) {
|
||||
// Hive Sequence table
|
||||
scanNode = new HdfsSeqFileScanNode(
|
||||
getNextNodeId(), tblRef.getDesc(), (HdfsSeqFileTable) tblRef.getTable());
|
||||
} else {
|
||||
// HBase table
|
||||
scanNode = new HBaseScanNode(getNextNodeId(), tblRef.getDesc());
|
||||
|
||||
@@ -23,6 +23,7 @@ public class DataErrorsTest {
|
||||
private static Executor executor;
|
||||
private static StringBuilder testErrorLog;
|
||||
private final String testDir = "DataErrorsTest";
|
||||
private static ArrayList<String> tableList;
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
@@ -30,81 +31,91 @@ public class DataErrorsTest {
|
||||
catalog = new Catalog(client);
|
||||
executor = new Executor(catalog);
|
||||
testErrorLog = new StringBuilder();
|
||||
tableList = new ArrayList<String>();
|
||||
tableList.add("");
|
||||
tableList.add("_rc");
|
||||
tableList.add("_seq");
|
||||
tableList.add("_seq_def");
|
||||
tableList.add("_seq_gzip");
|
||||
tableList.add("_seq_bzip");
|
||||
tableList.add("_seq_snap");
|
||||
tableList.add("_seq_record_def");
|
||||
tableList.add("_seq_record_gzip");
|
||||
tableList.add("_seq_record_bzip");
|
||||
tableList.add("_seq_record_snap");
|
||||
}
|
||||
|
||||
private void runErrorTestFile(String testFile, boolean abortOnError, int maxErrors) {
|
||||
private void runErrorTestFile(String testFile, boolean abortOnError, int maxErrors,
|
||||
ArrayList<String> tables) {
|
||||
StringBuilder errorLog = new StringBuilder();
|
||||
String fileName = testDir + "/" + testFile + ".test";
|
||||
TestFileParser queryFileParser = new TestFileParser(fileName);
|
||||
queryFileParser.parseFile();
|
||||
for (TestCase testCase : queryFileParser.getTestCases()) {
|
||||
ArrayList<String> expectedErrors = testCase.getSectionContents(Section.ERRORS);
|
||||
// The test file is assumed to contain all errors. We may only want to compare a few of them.
|
||||
int errorsToCompare = Math.min(expectedErrors.size(), maxErrors);
|
||||
int lastLine = 0;
|
||||
int errorCount = 0;
|
||||
for (String line : expectedErrors) {
|
||||
// Indicates the last line of one error message.
|
||||
// The final line of an Hdfs error message starts with "line:",
|
||||
// and for Hbase tables with "row key:".
|
||||
if (line.startsWith("line:") || line.startsWith("row key:")) {
|
||||
errorCount++;
|
||||
}
|
||||
lastLine++;
|
||||
if (errorCount >= errorsToCompare) {
|
||||
break;
|
||||
for (int f = 0; f < (tables == null ? 1 : tables.size()); f++) {
|
||||
queryFileParser.parseFile(tables == null ? null : tables.get(f));
|
||||
for (TestCase testCase : queryFileParser.getTestCases()) {
|
||||
ArrayList<String> expectedErrors = testCase.getSectionContents(Section.ERRORS);
|
||||
// The test file is assumed to contain all errors.
|
||||
// We may only want to compare a few of them.
|
||||
int errorsToCompare = Math.min(expectedErrors.size(), maxErrors);
|
||||
int lastLine = 0;
|
||||
int errorCount = 0;
|
||||
for (String line : expectedErrors) {
|
||||
// Indicates the last line of one error message.
|
||||
// The final line of an Hdfs error message starts with "line:",
|
||||
// and for Hbase tables with "row key:".
|
||||
if (line.startsWith("line:") || line.startsWith("row key:")) {
|
||||
errorCount++;
|
||||
}
|
||||
lastLine++;
|
||||
if (errorCount >= errorsToCompare) {
|
||||
break;
|
||||
}
|
||||
while (expectedErrors.size() > lastLine) {
|
||||
expectedErrors.remove(expectedErrors.size() - 1);
|
||||
}
|
||||
// File error entries must be sorted by filename within .test file.
|
||||
ArrayList<String> expectedFileErrors =
|
||||
testCase.getSectionContents(Section.FILEERRORS);
|
||||
if (abortOnError && !expectedFileErrors.isEmpty()) {
|
||||
String[] fileErrSplits = expectedFileErrors.get(0).split(",");
|
||||
// We are expecting only a single file with a single error.
|
||||
String expectedFileError = fileErrSplits[0] + ",1";
|
||||
expectedFileErrors.clear();
|
||||
expectedFileErrors.add(expectedFileError);
|
||||
}
|
||||
// run query 3 ways: with backend's default batch size, with small batch size,
|
||||
// and with batch size of 1, which should trigger a lot of corner cases
|
||||
// in the execution engine code
|
||||
String query = testCase.getQuery();
|
||||
TestUtils.runQuery(executor, query, 1, 0,
|
||||
abortOnError, maxErrors, testCase.getStartingLineNum(), null, null, null,
|
||||
expectedErrors, expectedFileErrors, testErrorLog);
|
||||
TestUtils.runQuery(executor, query, 1, 16,
|
||||
abortOnError, maxErrors, testCase.getStartingLineNum(), null, null, null,
|
||||
expectedErrors, expectedFileErrors, testErrorLog);
|
||||
TestUtils.runQuery(executor, query, 1, 1,
|
||||
abortOnError, maxErrors, testCase.getStartingLineNum(), null, null, null,
|
||||
expectedErrors, expectedFileErrors, testErrorLog);
|
||||
}
|
||||
}
|
||||
while (expectedErrors.size() > lastLine) {
|
||||
expectedErrors.remove(expectedErrors.size() - 1);
|
||||
}
|
||||
// File error entries must be sorted by filename within .test file.
|
||||
ArrayList<String> expectedFileErrors = testCase.getSectionContents(Section.FILEERRORS);
|
||||
if (abortOnError && !expectedFileErrors.isEmpty()) {
|
||||
String[] fileErrSplits = expectedFileErrors.get(0).split(",");
|
||||
// We are expecting only a single file with a single error.
|
||||
String expectedFileError = fileErrSplits[0] + ",1";
|
||||
expectedFileErrors.clear();
|
||||
expectedFileErrors.add(expectedFileError);
|
||||
}
|
||||
// run query 3 ways: with backend's default batch size, with small batch size,
|
||||
// and with batch size of 1, which should trigger a lot of corner cases
|
||||
// in the execution engine code
|
||||
String query = testCase.getQuery();
|
||||
TestUtils.runQuery(executor, query,
|
||||
1, 0, abortOnError, maxErrors, testCase.getStartingLineNum(), null, null, null,
|
||||
expectedErrors, expectedFileErrors, testErrorLog);
|
||||
TestUtils.runQuery(executor, query,
|
||||
1, 16, abortOnError, maxErrors, testCase.getStartingLineNum(), null, null, null,
|
||||
expectedErrors, expectedFileErrors, testErrorLog);
|
||||
TestUtils.runQuery(executor, query,
|
||||
1, 1, abortOnError, maxErrors, testCase.getStartingLineNum(), null, null, null,
|
||||
expectedErrors, expectedFileErrors, testErrorLog);
|
||||
}
|
||||
|
||||
if (errorLog.length() != 0) {
|
||||
fail(errorLog.toString());
|
||||
if (errorLog.length() != 0) {
|
||||
fail(errorLog.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestHdfsScanNodeErrors() {
|
||||
runErrorTestFile("hdfs-scan-node-errors", false, 100);
|
||||
runErrorTestFile("hdfs-scan-node-errors", false, 5);
|
||||
runErrorTestFile("hdfs-scan-node-errors", true, 1);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestHdfsRCFileScanNodeErrors() {
|
||||
runErrorTestFile("hdfs-rcfile-scan-node-errors", false, 100);
|
||||
runErrorTestFile("hdfs-rcfile-scan-node-errors", false, 5);
|
||||
runErrorTestFile("hdfs-rcfile-scan-node-errors", true, 1);
|
||||
runErrorTestFile("hdfs-scan-node-errors", false, 100, tableList);
|
||||
runErrorTestFile("hdfs-scan-node-errors", false, 5, tableList);
|
||||
runErrorTestFile("hdfs-scan-node-errors", true, 1, tableList);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestHBaseScanNodeErrors() {
|
||||
runErrorTestFile("hbase-scan-node-errors", false, 100);
|
||||
runErrorTestFile("hbase-scan-node-errors", false, 5);
|
||||
runErrorTestFile("hbase-scan-node-errors", true, 1);
|
||||
runErrorTestFile("hbase-scan-node-errors", false, 100, null);
|
||||
runErrorTestFile("hbase-scan-node-errors", false, 5, null);
|
||||
runErrorTestFile("hbase-scan-node-errors", true, 1, null);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -21,43 +21,64 @@ public class QueryTest {
|
||||
private static Catalog catalog;
|
||||
private static Executor executor;
|
||||
private final String testDir = "QueryTest";
|
||||
private static ArrayList<String> tableSubsitutionList;
|
||||
|
||||
@BeforeClass
|
||||
public static void setUp() throws Exception {
|
||||
HiveMetaStoreClient client = TestSchemaUtils.createClient();
|
||||
catalog = new Catalog(client);
|
||||
executor = new Executor(catalog);
|
||||
tableSubsitutionList = new ArrayList<String>();
|
||||
tableSubsitutionList.add("");
|
||||
tableSubsitutionList.add("_rc");
|
||||
tableSubsitutionList.add("_seq");
|
||||
tableSubsitutionList.add("_seq_def");
|
||||
tableSubsitutionList.add("_seq_gzip");
|
||||
tableSubsitutionList.add("_seq_bzip");
|
||||
tableSubsitutionList.add("_seq_snap");
|
||||
tableSubsitutionList.add("_seq_record_def");
|
||||
tableSubsitutionList.add("_seq_record_gzip");
|
||||
tableSubsitutionList.add("_seq_record_bzip");
|
||||
tableSubsitutionList.add("_seq_record_snap");
|
||||
}
|
||||
|
||||
private void runQueryTestFile(String testFile, boolean abortOnError, int maxErrors) {
|
||||
runQueryTestFile(testFile, abortOnError, maxErrors, null);
|
||||
}
|
||||
|
||||
private void runQueryTestFile(String testFile, boolean abortOnError, int maxErrors,
|
||||
ArrayList<String> tables) {
|
||||
String fileName = testDir + "/" + testFile + ".test";
|
||||
TestFileParser queryFileParser = new TestFileParser(fileName);
|
||||
queryFileParser.parseFile();
|
||||
StringBuilder errorLog = new StringBuilder();
|
||||
for (TestCase testCase : queryFileParser.getTestCases()) {
|
||||
ArrayList<String> expectedTypes =
|
||||
testCase.getSectionContents(Section.TYPES);
|
||||
ArrayList<String> expectedResults =
|
||||
testCase.getSectionContents(Section.RESULTS);
|
||||
// run each test against all possible combinations of batch sizes and
|
||||
// number of execution nodes
|
||||
int[] batchSizes = {0, 16, 1};
|
||||
int[] numNodes = {1, 2, 3, 0};
|
||||
for (int i = 0; i < batchSizes.length; ++i) {
|
||||
for (int j = 0; j < numNodes.length; ++j) {
|
||||
TestUtils.runQuery(
|
||||
executor, testCase.getSectionAsString(Section.QUERY, false, " "),
|
||||
numNodes[j], batchSizes[i], abortOnError, maxErrors,
|
||||
testCase.getStartingLineNum(), null, expectedTypes,
|
||||
expectedResults, null, null, errorLog);
|
||||
for (int f = 0; f < (tables == null ? 1 : tables.size()); f++) {
|
||||
queryFileParser.parseFile(tables == null ? null : tables.get(f));
|
||||
StringBuilder errorLog = new StringBuilder();
|
||||
for (TestCase testCase : queryFileParser.getTestCases()) {
|
||||
ArrayList<String> expectedTypes =
|
||||
testCase.getSectionContents(Section.TYPES);
|
||||
ArrayList<String> expectedResults =
|
||||
testCase.getSectionContents(Section.RESULTS);
|
||||
// run each test against all possible combinations of batch sizes and
|
||||
// number of execution nodes
|
||||
int[] batchSizes = {0, 16, 1};
|
||||
int[] numNodes = {1, 2, 3, 0};
|
||||
for (int i = 0; i < batchSizes.length; ++i) {
|
||||
for (int j = 0; j < numNodes.length; ++j) {
|
||||
TestUtils.runQuery(
|
||||
executor, testCase.getSectionAsString(Section.QUERY, false, " "),
|
||||
numNodes[j], batchSizes[i], abortOnError, maxErrors,
|
||||
testCase.getStartingLineNum(), null, expectedTypes,
|
||||
expectedResults, null, null, errorLog);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
if (errorLog.length() != 0) {
|
||||
fail(errorLog.toString());
|
||||
if (errorLog.length() != 0) {
|
||||
fail(errorLog.toString());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
@Test
|
||||
public void TestDistinct() {
|
||||
runQueryTestFile("distinct", false, 1000);
|
||||
@@ -74,23 +95,13 @@ public class QueryTest {
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestHdfsTextScanNode() {
|
||||
runQueryTestFile("hdfs-scan-node", false, 1000);
|
||||
public void TestHdfsScanNode() {
|
||||
runQueryTestFile("hdfs-scan-node", false, 1000, tableSubsitutionList);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestHdfsTextPartitions() {
|
||||
runQueryTestFile("hdfs-partitions", false, 1000);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestHdfsRCFileScanNode() {
|
||||
runQueryTestFile("hdfs-rcfile-scan-node", false, 1000);
|
||||
}
|
||||
|
||||
@Test
|
||||
public void TestHdfsRCFilePartitions() {
|
||||
runQueryTestFile("hdfs-rcfile-partitions", false, 1000);
|
||||
public void TestFilePartions() {
|
||||
runQueryTestFile("hdfs-partitions", false, 1000, tableSubsitutionList);
|
||||
}
|
||||
|
||||
@Test
|
||||
|
||||
@@ -141,6 +141,7 @@ public class TestFileParser {
|
||||
private final String fileName;
|
||||
private InputStream stream;
|
||||
private Scanner scanner;
|
||||
private String table;
|
||||
|
||||
/**
|
||||
* For backwards compatibility, if no title is found this is the order in which
|
||||
@@ -160,7 +161,8 @@ public class TestFileParser {
|
||||
/**
|
||||
* Initialises the scanner and the input stream corresponding to the test file name
|
||||
*/
|
||||
private void open() {
|
||||
private void open(String table) {
|
||||
this.table = table;
|
||||
try {
|
||||
ClassLoader classLoader = Thread.currentThread().getContextClassLoader();
|
||||
stream = classLoader.getResourceAsStream(fileName);
|
||||
@@ -218,6 +220,9 @@ public class TestFileParser {
|
||||
|
||||
sectionContents = Lists.newArrayList();
|
||||
} else {
|
||||
if (table != null && currentSection == Section.QUERY) {
|
||||
line = line.replaceAll("\\$TABLE", table);
|
||||
}
|
||||
sectionContents.add(line);
|
||||
}
|
||||
}
|
||||
@@ -229,7 +234,11 @@ public class TestFileParser {
|
||||
* Parses a test file in its entirety and constructs a list of TestCases.
|
||||
*/
|
||||
public void parseFile() {
|
||||
open();
|
||||
parseFile(null);
|
||||
}
|
||||
|
||||
public void parseFile(String table) {
|
||||
open(table);
|
||||
while (scanner.hasNextLine()) {
|
||||
testCases.add(parseOneTestCase());
|
||||
}
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col from alltypeserror
|
||||
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col from alltypeserror$TABLE
|
||||
---- ERRORS
|
||||
Error converting column: 1 TO BOOL
|
||||
file: alltypeserror/year=2009/month=1/0901.txt
|
||||
@@ -58,7 +59,7 @@ file: alltypeserror/year=2009/month=1/0901.txt,8
|
||||
file: alltypeserror/year=2009/month=2/0902.txt,3
|
||||
file: alltypeserror/year=2009/month=3/0903.txt,4
|
||||
====
|
||||
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col from alltypeserrornonulls
|
||||
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col, timestamp_col from alltypeserrornonulls$TABLE
|
||||
---- ERRORS
|
||||
Error converting column: 1 TO BOOL
|
||||
file: alltypeserrornonulls/year=2009/month=1/0901.txt
|
||||
|
||||
@@ -1,11 +1,11 @@
|
||||
select year, count(*) from alltypes group by 1
|
||||
select year, count(*) from alltypes$TABLE group by 1
|
||||
----
|
||||
int, bigint
|
||||
----
|
||||
2009,3650
|
||||
2010,3650
|
||||
====
|
||||
select month, count(*) from alltypes group by 1
|
||||
select month, count(*) from alltypes$TABLE group by 1
|
||||
----
|
||||
int, bigint
|
||||
----
|
||||
@@ -22,7 +22,7 @@ int, bigint
|
||||
6,600
|
||||
7,620
|
||||
====
|
||||
select year, month, count(*) from alltypes group by 1, 2
|
||||
select year, month, count(*) from alltypes$TABLE group by 1, 2
|
||||
----
|
||||
int, int, bigint
|
||||
----
|
||||
@@ -51,82 +51,82 @@ int, int, bigint
|
||||
2009,1,310
|
||||
2009,2,280
|
||||
====
|
||||
select count(*) from alltypes where year=2009
|
||||
select count(*) from alltypes$TABLE where year=2009
|
||||
----
|
||||
bigint
|
||||
----
|
||||
3650
|
||||
====
|
||||
# still works if 'year' needs a cast
|
||||
select count(*) from alltypes where year = 2009.0
|
||||
select count(*) from alltypes$TABLE where year = 2009.0
|
||||
----
|
||||
bigint
|
||||
----
|
||||
3650
|
||||
====
|
||||
# finds bindings for partition keys regardless of order of operands
|
||||
select count(*) from alltypes where 2009 = year
|
||||
select count(*) from alltypes$TABLE where 2009 = year
|
||||
----
|
||||
bigint
|
||||
----
|
||||
3650
|
||||
====
|
||||
select count(*) from alltypes where 2009.0 = year
|
||||
select count(*) from alltypes$TABLE where 2009.0 = year
|
||||
----
|
||||
bigint
|
||||
----
|
||||
3650
|
||||
====
|
||||
select count(*) from alltypes where month=1
|
||||
select count(*) from alltypes$TABLE where month=1
|
||||
----
|
||||
bigint
|
||||
----
|
||||
620
|
||||
====
|
||||
select count(*) from alltypes where year=2009 and month=1
|
||||
select count(*) from alltypes$TABLE where year=2009 and month=1
|
||||
----
|
||||
bigint
|
||||
----
|
||||
310
|
||||
====
|
||||
select count(*) from alltypes where year=2009 and month > 6
|
||||
select count(*) from alltypes$TABLE where year=2009 and month > 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
1840
|
||||
====
|
||||
select count(*) from alltypes where year=2009 and month < 6
|
||||
select count(*) from alltypes$TABLE where year=2009 and month < 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
1510
|
||||
====
|
||||
select count(*) from alltypes where year<=2009 and month < 6
|
||||
select count(*) from alltypes$TABLE where year<=2009 and month < 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
1510
|
||||
====
|
||||
select count(*) from alltypes where month < 9 and month > 6
|
||||
select count(*) from alltypes$TABLE where month < 9 and month > 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
1240
|
||||
====
|
||||
select count(*) from alltypes where year < 2010 and year < 2009 and month > 6
|
||||
select count(*) from alltypes$TABLE where year < 2010 and year < 2009 and month > 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
0
|
||||
====
|
||||
select count(*) from alltypes where year < 2010 and month > 6 and month > 12
|
||||
select count(*) from alltypes$TABLE where year < 2010 and month > 6 and month > 12
|
||||
----
|
||||
bigint
|
||||
----
|
||||
0
|
||||
====
|
||||
# Test multi files partitioned table (hdfs)
|
||||
select count(*) from alltypesaggmultifiles
|
||||
select count(*) from alltypesaggmultifiles$TABLE
|
||||
----
|
||||
bigint
|
||||
----
|
||||
|
||||
@@ -1,134 +0,0 @@
|
||||
select year, count(*) from alltypes_rc group by 1
|
||||
----
|
||||
int, bigint
|
||||
----
|
||||
2009,3650
|
||||
2010,3650
|
||||
====
|
||||
select month, count(*) from alltypes_rc group by 1
|
||||
----
|
||||
int, bigint
|
||||
----
|
||||
8,620
|
||||
9,600
|
||||
10,620
|
||||
11,600
|
||||
12,620
|
||||
1,620
|
||||
2,560
|
||||
3,620
|
||||
4,600
|
||||
5,620
|
||||
6,600
|
||||
7,620
|
||||
====
|
||||
select year, month, count(*) from alltypes_rc group by 1, 2
|
||||
----
|
||||
int, int, bigint
|
||||
----
|
||||
2010,2,280
|
||||
2009,5,310
|
||||
2010,1,310
|
||||
2009,6,300
|
||||
2009,3,310
|
||||
2009,4,300
|
||||
2009,9,300
|
||||
2009,10,310
|
||||
2010,6,300
|
||||
2009,7,310
|
||||
2010,5,310
|
||||
2009,8,310
|
||||
2010,4,300
|
||||
2010,3,310
|
||||
2010,10,310
|
||||
2009,11,300
|
||||
2010,9,300
|
||||
2009,12,310
|
||||
2010,8,310
|
||||
2010,7,310
|
||||
2010,12,310
|
||||
2010,11,300
|
||||
2009,1,310
|
||||
2009,2,280
|
||||
====
|
||||
select count(*) from alltypes_rc where year=2009
|
||||
----
|
||||
bigint
|
||||
----
|
||||
3650
|
||||
====
|
||||
# still works if 'year' needs a cast
|
||||
select count(*) from alltypes_rc where year = 2009.0
|
||||
----
|
||||
bigint
|
||||
----
|
||||
3650
|
||||
====
|
||||
# finds bindings for partition keys regardless of order of operands
|
||||
select count(*) from alltypes_rc where 2009 = year
|
||||
----
|
||||
bigint
|
||||
----
|
||||
3650
|
||||
====
|
||||
select count(*) from alltypes_rc where 2009.0 = year
|
||||
----
|
||||
bigint
|
||||
----
|
||||
3650
|
||||
====
|
||||
select count(*) from alltypes_rc where month=1
|
||||
----
|
||||
bigint
|
||||
----
|
||||
620
|
||||
====
|
||||
select count(*) from alltypes_rc where year=2009 and month=1
|
||||
----
|
||||
bigint
|
||||
----
|
||||
310
|
||||
====
|
||||
select count(*) from alltypes_rc where year=2009 and month > 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
1840
|
||||
====
|
||||
select count(*) from alltypes_rc where year=2009 and month < 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
1510
|
||||
====
|
||||
select count(*) from alltypes_rc where year<=2009 and month < 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
1510
|
||||
====
|
||||
select count(*) from alltypes_rc where month < 9 and month > 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
1240
|
||||
====
|
||||
select count(*) from alltypes_rc where year < 2010 and year < 2009 and month > 6
|
||||
----
|
||||
bigint
|
||||
----
|
||||
0
|
||||
====
|
||||
select count(*) from alltypes_rc where year < 2010 and month > 6 and month > 12
|
||||
----
|
||||
bigint
|
||||
----
|
||||
0
|
||||
====
|
||||
# Test multi files partitioned table (rc)
|
||||
select count(*) from alltypesaggmultifiles_rc
|
||||
----
|
||||
bigint
|
||||
----
|
||||
10000
|
||||
====
|
||||
@@ -1,509 +0,0 @@
|
||||
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col,
|
||||
double_col, date_string_col, string_col
|
||||
from alltypessmall_rc
|
||||
-----
|
||||
int,boolean,tinyint,smallint,int,bigint,float,double,string,string
|
||||
-----
|
||||
0,true,0,0,0,0,0,0,'01/01/09','0'
|
||||
1,false,1,1,1,10,1.1,10.1,'01/01/09','1'
|
||||
2,true,2,2,2,20,2.2,20.2,'01/01/09','2'
|
||||
3,false,3,3,3,30,3.3,30.3,'01/01/09','3'
|
||||
4,true,4,4,4,40,4.4,40.4,'01/01/09','4'
|
||||
5,false,5,5,5,50,5.5,50.5,'01/01/09','5'
|
||||
6,true,6,6,6,60,6.6,60.6,'01/01/09','6'
|
||||
7,false,7,7,7,70,7.7,70.7,'01/01/09','7'
|
||||
8,true,8,8,8,80,8.8,80.8,'01/01/09','8'
|
||||
9,false,9,9,9,90,9.9,90.9,'01/01/09','9'
|
||||
10,true,0,0,0,0,0,0,'01/02/09','0'
|
||||
11,false,1,1,1,10,1.1,10.1,'01/02/09','1'
|
||||
12,true,2,2,2,20,2.2,20.2,'01/02/09','2'
|
||||
13,false,3,3,3,30,3.3,30.3,'01/02/09','3'
|
||||
14,true,4,4,4,40,4.4,40.4,'01/02/09','4'
|
||||
15,false,5,5,5,50,5.5,50.5,'01/02/09','5'
|
||||
16,true,6,6,6,60,6.6,60.6,'01/02/09','6'
|
||||
17,false,7,7,7,70,7.7,70.7,'01/02/09','7'
|
||||
18,true,8,8,8,80,8.8,80.8,'01/02/09','8'
|
||||
19,false,9,9,9,90,9.9,90.9,'01/02/09','9'
|
||||
20,true,0,0,0,0,0,0,'01/03/09','0'
|
||||
21,false,1,1,1,10,1.1,10.1,'01/03/09','1'
|
||||
22,true,2,2,2,20,2.2,20.2,'01/03/09','2'
|
||||
23,false,3,3,3,30,3.3,30.3,'01/03/09','3'
|
||||
24,true,4,4,4,40,4.4,40.4,'01/03/09','4'
|
||||
25,false,0,0,0,0,0,0,'02/01/09','0'
|
||||
26,true,1,1,1,10,1.1,10.1,'02/01/09','1'
|
||||
27,false,2,2,2,20,2.2,20.2,'02/01/09','2'
|
||||
28,true,3,3,3,30,3.3,30.3,'02/01/09','3'
|
||||
29,false,4,4,4,40,4.4,40.4,'02/01/09','4'
|
||||
30,true,5,5,5,50,5.5,50.5,'02/01/09','5'
|
||||
31,false,6,6,6,60,6.6,60.6,'02/01/09','6'
|
||||
32,true,7,7,7,70,7.7,70.7,'02/01/09','7'
|
||||
33,false,8,8,8,80,8.8,80.8,'02/01/09','8'
|
||||
34,true,9,9,9,90,9.9,90.9,'02/01/09','9'
|
||||
35,false,0,0,0,0,0,0,'02/02/09','0'
|
||||
36,true,1,1,1,10,1.1,10.1,'02/02/09','1'
|
||||
37,false,2,2,2,20,2.2,20.2,'02/02/09','2'
|
||||
38,true,3,3,3,30,3.3,30.3,'02/02/09','3'
|
||||
39,false,4,4,4,40,4.4,40.4,'02/02/09','4'
|
||||
40,true,5,5,5,50,5.5,50.5,'02/02/09','5'
|
||||
41,false,6,6,6,60,6.6,60.6,'02/02/09','6'
|
||||
42,true,7,7,7,70,7.7,70.7,'02/02/09','7'
|
||||
43,false,8,8,8,80,8.8,80.8,'02/02/09','8'
|
||||
44,true,9,9,9,90,9.9,90.9,'02/02/09','9'
|
||||
45,false,0,0,0,0,0,0,'02/03/09','0'
|
||||
46,true,1,1,1,10,1.1,10.1,'02/03/09','1'
|
||||
47,false,2,2,2,20,2.2,20.2,'02/03/09','2'
|
||||
48,true,3,3,3,30,3.3,30.3,'02/03/09','3'
|
||||
49,false,4,4,4,40,4.4,40.4,'02/03/09','4'
|
||||
50,true,0,0,0,0,0,0,'03/01/09','0'
|
||||
51,false,1,1,1,10,1.1,10.1,'03/01/09','1'
|
||||
52,true,2,2,2,20,2.2,20.2,'03/01/09','2'
|
||||
53,false,3,3,3,30,3.3,30.3,'03/01/09','3'
|
||||
54,true,4,4,4,40,4.4,40.4,'03/01/09','4'
|
||||
55,false,5,5,5,50,5.5,50.5,'03/01/09','5'
|
||||
56,true,6,6,6,60,6.6,60.6,'03/01/09','6'
|
||||
57,false,7,7,7,70,7.7,70.7,'03/01/09','7'
|
||||
58,true,8,8,8,80,8.8,80.8,'03/01/09','8'
|
||||
59,false,9,9,9,90,9.9,90.9,'03/01/09','9'
|
||||
60,true,0,0,0,0,0,0,'03/02/09','0'
|
||||
61,false,1,1,1,10,1.1,10.1,'03/02/09','1'
|
||||
62,true,2,2,2,20,2.2,20.2,'03/02/09','2'
|
||||
63,false,3,3,3,30,3.3,30.3,'03/02/09','3'
|
||||
64,true,4,4,4,40,4.4,40.4,'03/02/09','4'
|
||||
65,false,5,5,5,50,5.5,50.5,'03/02/09','5'
|
||||
66,true,6,6,6,60,6.6,60.6,'03/02/09','6'
|
||||
67,false,7,7,7,70,7.7,70.7,'03/02/09','7'
|
||||
68,true,8,8,8,80,8.8,80.8,'03/02/09','8'
|
||||
69,false,9,9,9,90,9.9,90.9,'03/02/09','9'
|
||||
70,true,0,0,0,0,0,0,'03/03/09','0'
|
||||
71,false,1,1,1,10,1.1,10.1,'03/03/09','1'
|
||||
72,true,2,2,2,20,2.2,20.2,'03/03/09','2'
|
||||
73,false,3,3,3,30,3.3,30.3,'03/03/09','3'
|
||||
74,true,4,4,4,40,4.4,40.4,'03/03/09','4'
|
||||
75,false,0,0,0,0,0,0,'04/01/09','0'
|
||||
76,true,1,1,1,10,1.1,10.1,'04/01/09','1'
|
||||
77,false,2,2,2,20,2.2,20.2,'04/01/09','2'
|
||||
78,true,3,3,3,30,3.3,30.3,'04/01/09','3'
|
||||
79,false,4,4,4,40,4.4,40.4,'04/01/09','4'
|
||||
80,true,5,5,5,50,5.5,50.5,'04/01/09','5'
|
||||
81,false,6,6,6,60,6.6,60.6,'04/01/09','6'
|
||||
82,true,7,7,7,70,7.7,70.7,'04/01/09','7'
|
||||
83,false,8,8,8,80,8.8,80.8,'04/01/09','8'
|
||||
84,true,9,9,9,90,9.9,90.9,'04/01/09','9'
|
||||
85,false,0,0,0,0,0,0,'04/02/09','0'
|
||||
86,true,1,1,1,10,1.1,10.1,'04/02/09','1'
|
||||
87,false,2,2,2,20,2.2,20.2,'04/02/09','2'
|
||||
88,true,3,3,3,30,3.3,30.3,'04/02/09','3'
|
||||
89,false,4,4,4,40,4.4,40.4,'04/02/09','4'
|
||||
90,true,5,5,5,50,5.5,50.5,'04/02/09','5'
|
||||
91,false,6,6,6,60,6.6,60.6,'04/02/09','6'
|
||||
92,true,7,7,7,70,7.7,70.7,'04/02/09','7'
|
||||
93,false,8,8,8,80,8.8,80.8,'04/02/09','8'
|
||||
94,true,9,9,9,90,9.9,90.9,'04/02/09','9'
|
||||
95,false,0,0,0,0,0,0,'04/03/09','0'
|
||||
96,true,1,1,1,10,1.1,10.1,'04/03/09','1'
|
||||
97,false,2,2,2,20,2.2,20.2,'04/03/09','2'
|
||||
98,true,3,3,3,30,3.3,30.3,'04/03/09','3'
|
||||
99,false,4,4,4,40,4.4,40.4,'04/03/09','4'
|
||||
=====
|
||||
select id from alltypessmall_rc
|
||||
-----
|
||||
int
|
||||
-----
|
||||
0
|
||||
1
|
||||
2
|
||||
3
|
||||
4
|
||||
5
|
||||
6
|
||||
7
|
||||
8
|
||||
9
|
||||
10
|
||||
11
|
||||
12
|
||||
13
|
||||
14
|
||||
15
|
||||
16
|
||||
17
|
||||
18
|
||||
19
|
||||
20
|
||||
21
|
||||
22
|
||||
23
|
||||
24
|
||||
25
|
||||
26
|
||||
27
|
||||
28
|
||||
29
|
||||
30
|
||||
31
|
||||
32
|
||||
33
|
||||
34
|
||||
35
|
||||
36
|
||||
37
|
||||
38
|
||||
39
|
||||
40
|
||||
41
|
||||
42
|
||||
43
|
||||
44
|
||||
45
|
||||
46
|
||||
47
|
||||
48
|
||||
49
|
||||
50
|
||||
51
|
||||
52
|
||||
53
|
||||
54
|
||||
55
|
||||
56
|
||||
57
|
||||
58
|
||||
59
|
||||
60
|
||||
61
|
||||
62
|
||||
63
|
||||
64
|
||||
65
|
||||
66
|
||||
67
|
||||
68
|
||||
69
|
||||
70
|
||||
71
|
||||
72
|
||||
73
|
||||
74
|
||||
75
|
||||
76
|
||||
77
|
||||
78
|
||||
79
|
||||
80
|
||||
81
|
||||
82
|
||||
83
|
||||
84
|
||||
85
|
||||
86
|
||||
87
|
||||
88
|
||||
89
|
||||
90
|
||||
91
|
||||
92
|
||||
93
|
||||
94
|
||||
95
|
||||
96
|
||||
97
|
||||
98
|
||||
99
|
||||
=====
select * from alltypessmall_rc
-----
int,int,int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp
-----
2009,1,0,true,0,0,0,0,0,0,'01/01/09','0',2009-01-01 00:00:00
2009,1,1,false,1,1,1,10,1.1,10.1,'01/01/09','1',2009-01-01 00:01:00
2009,1,10,true,0,0,0,0,0,0,'01/02/09','0',2009-01-02 00:10:00.450000000
2009,1,11,false,1,1,1,10,1.1,10.1,'01/02/09','1',2009-01-02 00:11:00.450000000
2009,1,12,true,2,2,2,20,2.2,20.2,'01/02/09','2',2009-01-02 00:12:00.460000000
2009,1,13,false,3,3,3,30,3.3,30.3,'01/02/09','3',2009-01-02 00:13:00.480000000
2009,1,14,true,4,4,4,40,4.4,40.4,'01/02/09','4',2009-01-02 00:14:00.510000000
2009,1,15,false,5,5,5,50,5.5,50.5,'01/02/09','5',2009-01-02 00:15:00.550000000
2009,1,16,true,6,6,6,60,6.6,60.6,'01/02/09','6',2009-01-02 00:16:00.600000000
2009,1,17,false,7,7,7,70,7.7,70.7,'01/02/09','7',2009-01-02 00:17:00.660000000
2009,1,18,true,8,8,8,80,8.8,80.8,'01/02/09','8',2009-01-02 00:18:00.730000000
2009,1,19,false,9,9,9,90,9.9,90.9,'01/02/09','9',2009-01-02 00:19:00.810000000
2009,1,2,true,2,2,2,20,2.2,20.2,'01/01/09','2',2009-01-01 00:02:00.100000000
2009,1,20,true,0,0,0,0,0,0,'01/03/09','0',2009-01-03 00:20:00.900000000
2009,1,21,false,1,1,1,10,1.1,10.1,'01/03/09','1',2009-01-03 00:21:00.900000000
2009,1,22,true,2,2,2,20,2.2,20.2,'01/03/09','2',2009-01-03 00:22:00.910000000
2009,1,23,false,3,3,3,30,3.3,30.3,'01/03/09','3',2009-01-03 00:23:00.930000000
2009,1,24,true,4,4,4,40,4.4,40.4,'01/03/09','4',2009-01-03 00:24:00.960000000
2009,1,3,false,3,3,3,30,3.3,30.3,'01/01/09','3',2009-01-01 00:03:00.300000000
2009,1,4,true,4,4,4,40,4.4,40.4,'01/01/09','4',2009-01-01 00:04:00.600000000
2009,1,5,false,5,5,5,50,5.5,50.5,'01/01/09','5',2009-01-01 00:05:00.100000000
2009,1,6,true,6,6,6,60,6.6,60.6,'01/01/09','6',2009-01-01 00:06:00.150000000
2009,1,7,false,7,7,7,70,7.7,70.7,'01/01/09','7',2009-01-01 00:07:00.210000000
2009,1,8,true,8,8,8,80,8.8,80.8,'01/01/09','8',2009-01-01 00:08:00.280000000
2009,1,9,false,9,9,9,90,9.9,90.9,'01/01/09','9',2009-01-01 00:09:00.360000000
2009,2,25,false,0,0,0,0,0,0,'02/01/09','0',2009-02-01 00:00:00
2009,2,26,true,1,1,1,10,1.1,10.1,'02/01/09','1',2009-02-01 00:01:00
2009,2,27,false,2,2,2,20,2.2,20.2,'02/01/09','2',2009-02-01 00:02:00.100000000
2009,2,28,true,3,3,3,30,3.3,30.3,'02/01/09','3',2009-02-01 00:03:00.300000000
2009,2,29,false,4,4,4,40,4.4,40.4,'02/01/09','4',2009-02-01 00:04:00.600000000
2009,2,30,true,5,5,5,50,5.5,50.5,'02/01/09','5',2009-02-01 00:05:00.100000000
2009,2,31,false,6,6,6,60,6.6,60.6,'02/01/09','6',2009-02-01 00:06:00.150000000
2009,2,32,true,7,7,7,70,7.7,70.7,'02/01/09','7',2009-02-01 00:07:00.210000000
2009,2,33,false,8,8,8,80,8.8,80.8,'02/01/09','8',2009-02-01 00:08:00.280000000
2009,2,34,true,9,9,9,90,9.9,90.9,'02/01/09','9',2009-02-01 00:09:00.360000000
2009,2,35,false,0,0,0,0,0,0,'02/02/09','0',2009-02-02 00:10:00.450000000
2009,2,36,true,1,1,1,10,1.1,10.1,'02/02/09','1',2009-02-02 00:11:00.450000000
2009,2,37,false,2,2,2,20,2.2,20.2,'02/02/09','2',2009-02-02 00:12:00.460000000
2009,2,38,true,3,3,3,30,3.3,30.3,'02/02/09','3',2009-02-02 00:13:00.480000000
2009,2,39,false,4,4,4,40,4.4,40.4,'02/02/09','4',2009-02-02 00:14:00.510000000
2009,2,40,true,5,5,5,50,5.5,50.5,'02/02/09','5',2009-02-02 00:15:00.550000000
2009,2,41,false,6,6,6,60,6.6,60.6,'02/02/09','6',2009-02-02 00:16:00.600000000
2009,2,42,true,7,7,7,70,7.7,70.7,'02/02/09','7',2009-02-02 00:17:00.660000000
2009,2,43,false,8,8,8,80,8.8,80.8,'02/02/09','8',2009-02-02 00:18:00.730000000
2009,2,44,true,9,9,9,90,9.9,90.9,'02/02/09','9',2009-02-02 00:19:00.810000000
2009,2,45,false,0,0,0,0,0,0,'02/03/09','0',2009-02-03 00:20:00.900000000
2009,2,46,true,1,1,1,10,1.1,10.1,'02/03/09','1',2009-02-03 00:21:00.900000000
2009,2,47,false,2,2,2,20,2.2,20.2,'02/03/09','2',2009-02-03 00:22:00.910000000
2009,2,48,true,3,3,3,30,3.3,30.3,'02/03/09','3',2009-02-03 00:23:00.930000000
2009,2,49,false,4,4,4,40,4.4,40.4,'02/03/09','4',2009-02-03 00:24:00.960000000
2009,3,50,true,0,0,0,0,0,0,'03/01/09','0',2009-03-01 00:00:00
2009,3,51,false,1,1,1,10,1.1,10.1,'03/01/09','1',2009-03-01 00:01:00
2009,3,52,true,2,2,2,20,2.2,20.2,'03/01/09','2',2009-03-01 00:02:00.100000000
2009,3,53,false,3,3,3,30,3.3,30.3,'03/01/09','3',2009-03-01 00:03:00.300000000
2009,3,54,true,4,4,4,40,4.4,40.4,'03/01/09','4',2009-03-01 00:04:00.600000000
2009,3,55,false,5,5,5,50,5.5,50.5,'03/01/09','5',2009-03-01 00:05:00.100000000
2009,3,56,true,6,6,6,60,6.6,60.6,'03/01/09','6',2009-03-01 00:06:00.150000000
2009,3,57,false,7,7,7,70,7.7,70.7,'03/01/09','7',2009-03-01 00:07:00.210000000
2009,3,58,true,8,8,8,80,8.8,80.8,'03/01/09','8',2009-03-01 00:08:00.280000000
2009,3,59,false,9,9,9,90,9.9,90.9,'03/01/09','9',2009-03-01 00:09:00.360000000
2009,3,60,true,0,0,0,0,0,0,'03/02/09','0',2009-03-02 00:10:00.450000000
2009,3,61,false,1,1,1,10,1.1,10.1,'03/02/09','1',2009-03-02 00:11:00.450000000
2009,3,62,true,2,2,2,20,2.2,20.2,'03/02/09','2',2009-03-02 00:12:00.460000000
2009,3,63,false,3,3,3,30,3.3,30.3,'03/02/09','3',2009-03-02 00:13:00.480000000
2009,3,64,true,4,4,4,40,4.4,40.4,'03/02/09','4',2009-03-02 00:14:00.510000000
2009,3,65,false,5,5,5,50,5.5,50.5,'03/02/09','5',2009-03-02 00:15:00.550000000
2009,3,66,true,6,6,6,60,6.6,60.6,'03/02/09','6',2009-03-02 00:16:00.600000000
2009,3,67,false,7,7,7,70,7.7,70.7,'03/02/09','7',2009-03-02 00:17:00.660000000
2009,3,68,true,8,8,8,80,8.8,80.8,'03/02/09','8',2009-03-02 00:18:00.730000000
2009,3,69,false,9,9,9,90,9.9,90.9,'03/02/09','9',2009-03-02 00:19:00.810000000
2009,3,70,true,0,0,0,0,0,0,'03/03/09','0',2009-03-03 00:20:00.900000000
2009,3,71,false,1,1,1,10,1.1,10.1,'03/03/09','1',2009-03-03 00:21:00.900000000
2009,3,72,true,2,2,2,20,2.2,20.2,'03/03/09','2',2009-03-03 00:22:00.910000000
2009,3,73,false,3,3,3,30,3.3,30.3,'03/03/09','3',2009-03-03 00:23:00.930000000
2009,3,74,true,4,4,4,40,4.4,40.4,'03/03/09','4',2009-03-03 00:24:00.960000000
2009,4,75,false,0,0,0,0,0,0,'04/01/09','0',2009-04-01 00:00:00
2009,4,76,true,1,1,1,10,1.1,10.1,'04/01/09','1',2009-04-01 00:01:00
2009,4,77,false,2,2,2,20,2.2,20.2,'04/01/09','2',2009-04-01 00:02:00.100000000
2009,4,78,true,3,3,3,30,3.3,30.3,'04/01/09','3',2009-04-01 00:03:00.300000000
2009,4,79,false,4,4,4,40,4.4,40.4,'04/01/09','4',2009-04-01 00:04:00.600000000
2009,4,80,true,5,5,5,50,5.5,50.5,'04/01/09','5',2009-04-01 00:05:00.100000000
2009,4,81,false,6,6,6,60,6.6,60.6,'04/01/09','6',2009-04-01 00:06:00.150000000
2009,4,82,true,7,7,7,70,7.7,70.7,'04/01/09','7',2009-04-01 00:07:00.210000000
2009,4,83,false,8,8,8,80,8.8,80.8,'04/01/09','8',2009-04-01 00:08:00.280000000
2009,4,84,true,9,9,9,90,9.9,90.9,'04/01/09','9',2009-04-01 00:09:00.360000000
2009,4,85,false,0,0,0,0,0,0,'04/02/09','0',2009-04-02 00:10:00.450000000
2009,4,86,true,1,1,1,10,1.1,10.1,'04/02/09','1',2009-04-02 00:11:00.450000000
2009,4,87,false,2,2,2,20,2.2,20.2,'04/02/09','2',2009-04-02 00:12:00.460000000
2009,4,88,true,3,3,3,30,3.3,30.3,'04/02/09','3',2009-04-02 00:13:00.480000000
2009,4,89,false,4,4,4,40,4.4,40.4,'04/02/09','4',2009-04-02 00:14:00.510000000
2009,4,90,true,5,5,5,50,5.5,50.5,'04/02/09','5',2009-04-02 00:15:00.550000000
2009,4,91,false,6,6,6,60,6.6,60.6,'04/02/09','6',2009-04-02 00:16:00.600000000
2009,4,92,true,7,7,7,70,7.7,70.7,'04/02/09','7',2009-04-02 00:17:00.660000000
2009,4,93,false,8,8,8,80,8.8,80.8,'04/02/09','8',2009-04-02 00:18:00.730000000
2009,4,94,true,9,9,9,90,9.9,90.9,'04/02/09','9',2009-04-02 00:19:00.810000000
2009,4,95,false,0,0,0,0,0,0,'04/03/09','0',2009-04-03 00:20:00.900000000
2009,4,96,true,1,1,1,10,1.1,10.1,'04/03/09','1',2009-04-03 00:21:00.900000000
2009,4,97,false,2,2,2,20,2.2,20.2,'04/03/09','2',2009-04-03 00:22:00.910000000
2009,4,98,true,3,3,3,30,3.3,30.3,'04/03/09','3',2009-04-03 00:23:00.930000000
2009,4,99,false,4,4,4,40,4.4,40.4,'04/03/09','4',2009-04-03 00:24:00.960000000
=====
select month, date_string_col, year from alltypessmall_rc
-----
int,string,int
-----
1,'01/01/09',2009
1,'01/01/09',2009
1,'01/01/09',2009
1,'01/01/09',2009
1,'01/01/09',2009
1,'01/01/09',2009
1,'01/01/09',2009
1,'01/01/09',2009
1,'01/01/09',2009
1,'01/01/09',2009
1,'01/02/09',2009
1,'01/02/09',2009
1,'01/02/09',2009
1,'01/02/09',2009
1,'01/02/09',2009
1,'01/02/09',2009
1,'01/02/09',2009
1,'01/02/09',2009
1,'01/02/09',2009
1,'01/02/09',2009
1,'01/03/09',2009
1,'01/03/09',2009
1,'01/03/09',2009
1,'01/03/09',2009
1,'01/03/09',2009
2,'02/01/09',2009
2,'02/01/09',2009
2,'02/01/09',2009
2,'02/01/09',2009
2,'02/01/09',2009
2,'02/01/09',2009
2,'02/01/09',2009
2,'02/01/09',2009
2,'02/01/09',2009
2,'02/01/09',2009
2,'02/02/09',2009
2,'02/02/09',2009
2,'02/02/09',2009
2,'02/02/09',2009
2,'02/02/09',2009
2,'02/02/09',2009
2,'02/02/09',2009
2,'02/02/09',2009
2,'02/02/09',2009
2,'02/02/09',2009
2,'02/03/09',2009
2,'02/03/09',2009
2,'02/03/09',2009
2,'02/03/09',2009
2,'02/03/09',2009
3,'03/01/09',2009
3,'03/01/09',2009
3,'03/01/09',2009
3,'03/01/09',2009
3,'03/01/09',2009
3,'03/01/09',2009
3,'03/01/09',2009
3,'03/01/09',2009
3,'03/01/09',2009
3,'03/01/09',2009
3,'03/02/09',2009
3,'03/02/09',2009
3,'03/02/09',2009
3,'03/02/09',2009
3,'03/02/09',2009
3,'03/02/09',2009
3,'03/02/09',2009
3,'03/02/09',2009
3,'03/02/09',2009
3,'03/02/09',2009
3,'03/03/09',2009
3,'03/03/09',2009
3,'03/03/09',2009
3,'03/03/09',2009
3,'03/03/09',2009
4,'04/01/09',2009
4,'04/01/09',2009
4,'04/01/09',2009
4,'04/01/09',2009
4,'04/01/09',2009
4,'04/01/09',2009
4,'04/01/09',2009
4,'04/01/09',2009
4,'04/01/09',2009
4,'04/01/09',2009
4,'04/02/09',2009
4,'04/02/09',2009
4,'04/02/09',2009
4,'04/02/09',2009
4,'04/02/09',2009
4,'04/02/09',2009
4,'04/02/09',2009
4,'04/02/09',2009
4,'04/02/09',2009
4,'04/02/09',2009
4,'04/03/09',2009
4,'04/03/09',2009
4,'04/03/09',2009
4,'04/03/09',2009
4,'04/03/09',2009
========
select id from alltypessmall_rc where id = 10
----
int
----
10
====
# We expect that conversion errors are turned into NULLs
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col
from alltypeserror_rc
-----
int,boolean,tinyint,smallint,int,bigint,float,double,string,string
-----
0,NULL,NULL,0,0,0,0,0,'01/01/09','0'
1,NULL,NULL,1,1,10,1,10.1,'01/01/09','1'
2,true,NULL,NULL,2,20,2,20.2,'01/01/09','2'
3,false,3,NULL,NULL,30,3,30.3,'01/01/09','3'
4,true,4,4,NULL,NULL,4,40.4,'01/01/09','4'
5,false,5,5,5,NULL,NULL,50.5,'01/01/09','5'
6,true,6,6,6,60,NULL,NULL,'01/01/09','6'
7,NULL,NULL,7,7,70,7,NULL,'01/01/09','7'
8,false,NULL,NULL,8,80,8,80.8,'01/01/09','8'
9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/09','9'
10,NULL,NULL,NULL,0,0,0,0,'02/01/09','0'
11,false,NULL,NULL,NULL,10,1,10.1,'02/01/09','1'
12,true,2,NULL,NULL,NULL,2,20.2,'02/01/09','2'
13,false,3,3,NULL,NULL,NULL,NULL,'02/01/09','3'
14,true,4,4,4,40,NULL,NULL,'02/01/09','4'
15,false,NULL,5,5,50,5,50.5,'02/01/09','5'
16,NULL,NULL,NULL,NULL,NULL,NULL,NULL,'02/01/09','6'
17,false,7,7,7,70,7,NULL,'02/01/09','7'
18,true,8,8,8,80,8,80.8,'02/01/09','8'
19,false,9,9,9,90,9,90.9,'02/01/09','9'
20,true,0,0,0,0,0,0,'03/01/09','0'
21,false,1,1,1,10,1,10.1,'03/01/09','1'
22,true,2,2,2,20,2,20.2,'03/01/09','2'
23,false,3,NULL,3,30,3,30.3,'03/01/09','3'
24,true,4,4,4,40,4,40.4,'03/01/09','4'
25,false,5,5,NULL,50,5,50.5,'03/01/09','5'
26,true,6,6,6,60,6,60.6,'03/01/09','6'
27,false,NULL,7,7,70,7,70.7,'03/01/09','7'
28,true,8,8,8,80,8,80.8,'03/01/09','8'
29,false,9,9,NULL,90,9,90.9,'03/01/09','9'
====
# We expect that conversion errors are turned into NULLs
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col
from alltypeserrornonulls_rc
-----
int,boolean,tinyint,smallint,int,bigint,float,double,string,string
-----
0,true,0,0,0,0,0,0,'01/01/09','0'
1,NULL,1,1,1,10,1,10.1,'01/01/09','1'
2,true,NULL,2,2,20,2,20.2,'01/01/09','2'
3,false,3,NULL,3,30,3,30.3,'01/01/09','3'
4,true,4,4,NULL,40,4,40.4,'01/01/09','4'
5,false,5,5,5,NULL,5,50.5,'01/01/09','5'
6,true,6,6,6,60,NULL,60.6,'01/01/09','6'
7,false,7,7,7,70,7,NULL,'01/01/09','7'
8,false,8,8,8,80,8,80.8,'01/01/09','8'
9,NULL,NULL,NULL,NULL,NULL,NULL,NULL,'01/01/09','9'
10,true,0,0,0,0,0,0,'02/01/09','0'
11,false,1,1,1,10,1,10.1,'02/01/09','1'
12,true,2,2,2,20,2,20.2,'02/01/09','2'
13,false,3,3,3,30,NULL,NULL,'02/01/09','3'
14,true,4,4,4,40,4,40.4,'02/01/09','4'
15,false,NULL,5,5,50,5,50.5,'02/01/09','5'
16,true,6,6,6,60,6,60.6,'02/01/09','6'
17,false,7,7,7,70,7,NULL,'02/01/09','7'
18,true,8,8,8,80,8,80.8,'02/01/09','8'
19,false,9,9,9,90,9,90.9,'02/01/09','9'
20,true,0,0,0,0,0,0,'03/01/09','0'
21,false,1,1,1,10,1,10.1,'03/01/09','1'
22,true,2,2,2,20,2,20.2,'03/01/09','2'
23,false,3,NULL,3,30,3,30.3,'03/01/09','3'
24,true,4,4,4,40,4,40.4,'03/01/09','4'
25,false,5,5,NULL,50,5,50.5,'03/01/09','5'
26,true,6,6,6,60,6,60.6,'03/01/09','6'
27,false,NULL,7,7,70,7,70.7,'03/01/09','7'
28,true,8,8,8,80,8,80.8,'03/01/09','8'
29,false,9,9,NULL,90,9,90.9,'03/01/09','9'
====
# Test multi files non-partitioned table (rc)
select count(*) from alltypesaggmultifilesnopart_rc
----
bigint
----
10000
====
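The two error-table cases above encode the expectation that conversion errors surface as NULLs rather than as dropped rows. A minimal sketch of how that expectation could be spot-checked interactively, assuming only the table name and the expected output listed above (the queries themselves are illustrative and are not part of the test files):

-- Illustrative sanity checks against the RC-file error table (not part of the test suite).
-- The row count should match the 30 expected rows listed above.
select count(*) from alltypeserror_rc;
-- Conversion errors show up as NULLs; per the expected output, bool_col is NULL for ids 0, 1, 7, 9, 10 and 16.
select count(*) from alltypeserror_rc where bool_col is null;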
@@ -1,6 +1,6 @@
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col,
double_col, date_string_col, string_col, timestamp_col
from alltypessmall
from alltypessmall$TABLE
-----
int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp
-----
@@ -105,7 +105,7 @@ int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp
98,true,3,3,3,30,3.3,30.3,'04/03/09','3',2009-04-03 00:23:00.930000000
99,false,4,4,4,40,4.4,40.4,'04/03/09','4',2009-04-03 00:24:00.960000000
=====
select id from alltypessmall
select id from alltypessmall$TABLE
-----
int
-----
@@ -210,7 +210,7 @@ int
98
99
=====
select * from alltypessmall
select * from alltypessmall$TABLE
-----
int,int,int,boolean,tinyint,smallint,int,bigint,float,double,string,string,timestamp
-----
@@ -315,7 +315,7 @@ int,int,int,boolean,tinyint,smallint,int,bigint,float,double,string,string,times
2009,4,98,true,3,3,3,30,3.3,30.3,'04/03/09','3',2009-04-03 00:23:00.930000000
2009,4,99,false,4,4,4,40,4.4,40.4,'04/03/09','4',2009-04-03 00:24:00.960000000
=====
select month, date_string_col, year from alltypessmall
select month, date_string_col, year from alltypessmall$TABLE
-----
int,string,int
-----
@@ -420,7 +420,7 @@ int,string,int
4,'04/03/09',2009
4,'04/03/09',2009
========
select id from alltypessmall where id = 10
select id from alltypessmall$TABLE where id = 10
----
int
----
@@ -428,7 +428,7 @@ int
====
# We expect that conversion errors are turned into NULLs
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col
from alltypeserror
from alltypeserror$TABLE
-----
int,boolean,tinyint,smallint,int,bigint,float,double,string,string
-----
@@ -465,7 +465,7 @@ int,boolean,tinyint,smallint,int,bigint,float,double,string,string
====
# We expect that conversion errors are turned into NULLs
select id, bool_col, tinyint_col, smallint_col, int_col, bigint_col, float_col, double_col, date_string_col, string_col
from alltypeserrornonulls
from alltypeserrornonulls$TABLE
-----
int,boolean,tinyint,smallint,int,bigint,float,double,string,string
-----
@@ -502,7 +502,7 @@ int,boolean,tinyint,smallint,int,bigint,float,double,string,string
====
# partition key values are materialized correctly across file boundaries
select day, month, year, string_col
from alltypesagg
from alltypesagg$TABLE
where string_col = '0'
-----
int, int, int, string
@@ -539,7 +539,7 @@ tinyint, smallint, int, bigint, float, double
-128,-32768,-2147483648,-9223372036854775808,-inf,-inf
====
# Test multi files non-partitioned table (hdfs)
select count(*) from AllTypesAggMultiFilesNoPart
select count(*) from AllTypesAggMultiFilesNoPart$TABLE
----
bigint
----

testdata/bin/create-benchmark.sql
@@ -28,3 +28,21 @@ CREATE TABLE UserVisits (
  avgTimeOnSite int)
row format delimited fields terminated by '|' stored as textfile;

DROP TABLE IF EXISTS UserVisits_seq;
CREATE TABLE UserVisits_seq (
  sourceIP string,
  destURL string,
  visitDate string,
  adRevenue float,
  userAgent string,
  cCode string,
  lCode string,
  sKeyword string,
  avgTimeOnSite int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS Grep1GB_seq_snap;
CREATE TABLE Grep1GB_seq_snap (
  field string)
partitioned by (chunk int)
STORED AS SEQUENCEFILE;

testdata/bin/create.sql
@@ -45,24 +45,146 @@ CREATE TABLE AllTypes_rc (
partitioned by (year int, month int)
STORED AS RCFILE;

DROP TABLE IF EXISTS AllTypes_seq;
CREATE TABLE AllTypes_seq (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypes_seq_def;
CREATE TABLE AllTypes_seq_def LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypes_seq_gzip;
CREATE TABLE AllTypes_seq_gzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypes_seq_bzip;
CREATE TABLE AllTypes_seq_bzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypes_seq_snap;
CREATE TABLE AllTypes_seq_snap LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypes_seq_record_def;
CREATE TABLE AllTypes_seq_record_def LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypes_seq_record_gzip;
CREATE TABLE AllTypes_seq_record_gzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypes_seq_record_bzip;
CREATE TABLE AllTypes_seq_record_bzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypes_seq_record_snap;
CREATE TABLE AllTypes_seq_record_snap LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesSmall;
CREATE TABLE AllTypesSmall LIKE AllTypes;

DROP TABLE IF EXISTS AllTypesSmall_rc;
CREATE TABLE AllTypesSmall_rc LIKE AllTypes_rc;

DROP TABLE IF EXISTS AllTypesSmall_seq;
CREATE TABLE AllTypesSmall_seq LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesSmall_seq_def;
CREATE TABLE AllTypesSmall_seq_def LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesSmall_seq_gzip;
CREATE TABLE AllTypesSmall_seq_gzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesSmall_seq_bzip;
CREATE TABLE AllTypesSmall_seq_bzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesSmall_seq_snap;
CREATE TABLE AllTypesSmall_seq_snap LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesSmall_seq_record_def;
CREATE TABLE AllTypesSmall_seq_record_def LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesSmall_seq_record_gzip;
CREATE TABLE AllTypesSmall_seq_record_gzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesSmall_seq_record_bzip;
CREATE TABLE AllTypesSmall_seq_record_bzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesSmall_seq_record_snap;
CREATE TABLE AllTypesSmall_seq_record_snap LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesError;
CREATE TABLE AllTypesError LIKE AllTypes;

DROP TABLE IF EXISTS AlltypesError_rc;
CREATE TABLE AllTypesError_rc LIKE AllTypes_rc;

DROP TABLE IF EXISTS AlltypesError_seq;
CREATE TABLE AllTypesError_seq LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesError_seq_def;
CREATE TABLE AllTypesError_seq_def LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesError_seq_gzip;
CREATE TABLE AllTypesError_seq_gzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesError_seq_bzip;
CREATE TABLE AllTypesError_seq_bzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesError_seq_snap;
CREATE TABLE AllTypesError_seq_snap LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesError_seq_record_def;
CREATE TABLE AllTypesError_seq_record_def LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesError_seq_record_gzip;
CREATE TABLE AllTypesError_seq_record_gzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesError_seq_record_bzip;
CREATE TABLE AllTypesError_seq_record_bzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesError_seq_record_snap;
CREATE TABLE AllTypesError_seq_record_snap LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesErrorNoNulls;
CREATE TABLE AllTypesErrorNoNulls LIKE AllTypes;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_rc;
CREATE TABLE AllTypesErrorNoNulls_rc LIKE AllTypes_rc;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_seq;
CREATE TABLE AllTypesErrorNoNulls_seq LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_seq_def;
CREATE TABLE AllTypesErrorNoNulls_seq_def LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_seq_gzip;
CREATE TABLE AllTypesErrorNoNulls_seq_gzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_seq_bzip;
CREATE TABLE AllTypesErrorNoNulls_seq_bzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_seq_snap;
CREATE TABLE AllTypesErrorNoNulls_seq_snap LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_seq_record_def;
CREATE TABLE AllTypesErrorNoNulls_seq_record_def LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_seq_record_gzip;
CREATE TABLE AllTypesErrorNoNulls_seq_record_gzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_seq_record_bzip;
CREATE TABLE AllTypesErrorNoNulls_seq_record_bzip LIKE AllTypes_seq;

DROP TABLE IF EXISTS AlltypesErrorNoNulls_seq_record_snap;
CREATE TABLE AllTypesErrorNoNulls_seq_record_snap LIKE AllTypes_seq;

DROP TABLE IF EXISTS AllTypesAgg;
CREATE TABLE AllTypesAgg (
  id int,
@@ -95,12 +217,79 @@ CREATE TABLE AllTypesAgg_rc (
partitioned by (year int, month int, day int)
STORED AS RCFILE;

DROP TABLE IF EXISTS AllTypesAgg_seq;
CREATE TABLE AllTypesAgg_seq (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAgg_seq_def;
CREATE TABLE AllTypesAgg_seq_def LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAgg_seq_gzip;
CREATE TABLE AllTypesAgg_seq_gzip LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAgg_seq_bzip;
CREATE TABLE AllTypesAgg_seq_bzip LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAgg_seq_snap;
CREATE TABLE AllTypesAgg_seq_snap LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAgg_seq_record_def;
CREATE TABLE AllTypesAgg_seq_record_def LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAgg_seq_record_gzip;
CREATE TABLE AllTypesAgg_seq_record_gzip LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAgg_seq_record_bzip;
CREATE TABLE AllTypesAgg_seq_record_bzip LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAgg_seq_record_snap;
CREATE TABLE AllTypesAgg_seq_record_snap LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAggNoNulls;
CREATE TABLE AllTypesAggNoNulls LIKE AllTypesAgg;

DROP TABLE IF EXISTS AllTypesAggNoNulls_rc;
CREATE TABLE AllTypesAggNoNulls_rc LIKE AllTypesAgg_rc;

DROP TABLE IF EXISTS AllTypesAggNoNulls_seq;
CREATE TABLE AllTypesAggNoNulls_seq LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAggNoNulls_seq_def;
CREATE TABLE AllTypesAggNoNulls_seq_def LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAggNoNulls_seq_gzip;
CREATE TABLE AllTypesAggNoNulls_seq_gzip LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAggNoNulls_seq_bzip;
CREATE TABLE AllTypesAggNoNulls_seq_bzip LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAggNoNulls_seq_snap;
CREATE TABLE AllTypesAggNoNulls_seq_snap LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAggNoNulls_seq_record_def;
CREATE TABLE AllTypesAggNoNulls_seq_record_def LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAggNoNulls_seq_record_gzip;
CREATE TABLE AllTypesAggNoNulls_seq_record_gzip LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAggNoNulls_seq_record_bzip;
CREATE TABLE AllTypesAggNoNulls_seq_record_bzip LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS AllTypesAggNoNulls_seq_record_snap;
CREATE TABLE AllTypesAggNoNulls_seq_record_snap LIKE AllTypesAgg_seq;

DROP TABLE IF EXISTS DelimErrorTable;
CREATE TABLE DelimErrorTable (
  id int,
@@ -126,6 +315,37 @@ CREATE TABLE TestTbl_rc (
  zip int)
STORED AS RCFILE;

DROP TABLE IF EXISTS TestTbl_seq;
CREATE TABLE TestTbl_seq (
  id bigint,
  name string,
  zip int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS TestTbl_seq_def;
CREATE TABLE TestTbl_seq_def LIKE TestTbl_seq;

DROP TABLE IF EXISTS TestTbl_seq_gzip;
CREATE TABLE TestTbl_seq_gzip LIKE TestTbl_seq;

DROP TABLE IF EXISTS TestTbl_seq_bzip;
CREATE TABLE TestTbl_seq_bzip LIKE TestTbl_seq;

DROP TABLE IF EXISTS TestTbl_seq_snap;
CREATE TABLE TestTbl_seq_snap LIKE TestTbl_seq;

DROP TABLE IF EXISTS TestTbl_seq_record_def;
CREATE TABLE TestTbl_seq_record_def LIKE TestTbl_seq;

DROP TABLE IF EXISTS TestTbl_seq_record_gzip;
CREATE TABLE TestTbl_seq_record_gzip LIKE TestTbl_seq;

DROP TABLE IF EXISTS TestTbl_seq_record_bzip;
CREATE TABLE TestTbl_seq_record_bzip LIKE TestTbl_seq;

DROP TABLE IF EXISTS TestTbl_seq_record_snap;
CREATE TABLE TestTbl_seq_record_snap LIKE TestTbl_seq;

DROP TABLE IF EXISTS DimTbl;
CREATE TABLE DimTbl (
  id bigint,
@@ -368,6 +588,150 @@ CREATE TABLE AllTypesAggMultiFiles_rc (
partitioned by (year int, month int, day int)
STORED AS RCFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFiles_seq;
CREATE TABLE AllTypesAggMultiFiles_seq (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFiles_seq_def;
CREATE TABLE AllTypesAggMultiFiles_seq_def (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFiles_seq_gzip;
CREATE TABLE AllTypesAggMultiFiles_seq_gzip (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFiles_seq_bzip;
CREATE TABLE AllTypesAggMultiFiles_seq_bzip (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFiles_seq_snap;
CREATE TABLE AllTypesAggMultiFiles_seq_snap (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFiles_seq_record_def;
CREATE TABLE AllTypesAggMultiFiles_seq_record_def (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFiles_seq_record_gzip;
CREATE TABLE AllTypesAggMultiFiles_seq_record_gzip (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFiles_seq_record_bzip;
CREATE TABLE AllTypesAggMultiFiles_seq_record_bzip (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFiles_seq_record_snap;
CREATE TABLE AllTypesAggMultiFiles_seq_record_snap (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
partitioned by (year int, month int, day int)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart;
CREATE TABLE AllTypesAggMultiFilesNoPart (
  id int,
@@ -411,3 +775,138 @@ CREATE TABLE AllTypesAggMultiFilesNoPart_rc (
  string_col string,
  timestamp_col timestamp)
STORED AS RCFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart_seq;
CREATE TABLE AllTypesAggMultiFilesNoPart_seq (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart_seq_def;
CREATE TABLE AllTypesAggMultiFilesNoPart_seq_def (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart_seq_gzip;
CREATE TABLE AllTypesAggMultiFilesNoPart_seq_gzip (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart_seq_bzip;
CREATE TABLE AllTypesAggMultiFilesNoPart_seq_bzip (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart_seq_snap;
CREATE TABLE AllTypesAggMultiFilesNoPart_seq_snap (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart_seq_record_def;
CREATE TABLE AllTypesAggMultiFilesNoPart_seq_record_def (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart_seq_record_gzip;
CREATE TABLE AllTypesAggMultiFilesNoPart_seq_record_gzip (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart_seq_record_bzip;
CREATE TABLE AllTypesAggMultiFilesNoPart_seq_record_bzip (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
STORED AS SEQUENCEFILE;

DROP TABLE IF EXISTS AllTypesAggMultiFilesNoPart_seq_record_snap;
CREATE TABLE AllTypesAggMultiFilesNoPart_seq_record_snap (
  id int,
  bool_col boolean,
  tinyint_col tinyint,
  smallint_col smallint,
  int_col int,
  bigint_col bigint,
  float_col float,
  double_col double,
  date_string_col string,
  string_col string,
  timestamp_col timestamp)
STORED AS SEQUENCEFILE;

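The *_seq_record_* tables above appear to differ from the plain *_seq_* ones only in the intended SequenceFile compression mode (record-level vs. block-level); the DDL itself is identical. A minimal sketch of how one record-compressed variant might be populated from the corresponding text table, assuming standard Hive/Hadoop session settings (the settings and codec below are illustrative, not part of this change):

-- Illustrative load of a record-compressed SequenceFile table (not part of create.sql).
set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
set hive.exec.compress.output=true;
set mapred.output.compression.type=RECORD;
set mapred.output.compression.codec=org.apache.hadoop.io.compress.GzipCodec;
INSERT OVERWRITE TABLE AllTypes_seq_record_gzip PARTITION (year, month) select * from AllTypes;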
testdata/bin/load-benchmark.sql
@@ -14,3 +14,11 @@ LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/hive_benchmark/grep10GB/part

LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/hive_benchmark/html1GB/Rankings.dat' OVERWRITE INTO TABLE Rankings;
LOAD DATA LOCAL INPATH '${env:IMPALA_HOME}/testdata/hive_benchmark/html1GB/UserVisits.dat' OVERWRITE INTO TABLE UserVisits;

INSERT OVERWRITE TABLE UserVisits_seq SELECT * from UserVisits;

set hive.exec.dynamic.partition.mode=nonstrict;
set hive.exec.dynamic.partition=true;
SET hive.exec.compress.output=true;
set mapred.output.compression.type=BLOCK;
INSERT OVERWRITE TABLE Grep1GB_seq_snap PARTITION (chunk) select * from Grep1GB;

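After the loads above, the SequenceFile copies can be sanity-checked against their text sources; an illustrative comparison (not part of the load script), where the two counts should match:

-- Illustrative check: the SequenceFile copy should contain the same number of rows as its text source.
select count(*) from UserVisits;
select count(*) from UserVisits_seq;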
testdata/bin/load-raw-data.sql
File diff suppressed because it is too large
testdata/bin/load.sql
File diff suppressed because it is too large