// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "exec/hdfs-sequence-scanner.h"
|
|
|
|
#include "codegen/llvm-codegen.h"
|
|
#include "exec/delimited-text-parser.inline.h"
|
|
#include "exec/hdfs-scan-node.h"
|
|
#include "exec/scan-range-context.h"
|
|
#include "exec/serde-utils.inline.h"
|
|
#include "exec/text-converter.inline.h"
|
|
#include "runtime/descriptors.h"
|
|
#include "runtime/runtime-state.h"
|
|
#include "runtime/tuple.h"
|
|
#include "runtime/tuple-row.h"
|
|
#include "util/codec.h"
|
|
|
|
using namespace boost;
|
|
using namespace impala;
|
|
using namespace llvm;
|
|
using namespace std;
|
|
|
|
const char* const HdfsSequenceScanner::SEQFILE_VALUE_CLASS_NAME =
    "org.apache.hadoop.io.Text";

const uint8_t HdfsSequenceScanner::SEQFILE_VERSION_HEADER[4] = {'S', 'E', 'Q', 6};

#define RETURN_IF_FALSE(x) if (UNLIKELY(!(x))) return parse_status_

HdfsSequenceScanner::HdfsSequenceScanner(HdfsScanNode* scan_node, RuntimeState* state)
    : BaseSequenceScanner(scan_node, state),
      unparsed_data_buffer_(NULL),
      num_buffered_records_in_compressed_block_(0) {
}

HdfsSequenceScanner::~HdfsSequenceScanner() {
}

// Codegen for materializing parsed data into tuples.
// TODO: the sequence file scanner needs to be split into a cross-compiled IR file,
// probably just for the block-compressed path. WriteCompleteTuple should be
// injected into that function.
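// When available, the codegen'd function is called through write_tuples_fn_ in
// ProcessBlockCompressedScanRange() below; otherwise the interpreted
// WriteAlignedTuples() path is used.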
Function* HdfsSequenceScanner::Codegen(HdfsScanNode* node) {
  LlvmCodeGen* codegen = node->runtime_state()->llvm_codegen();
  if (codegen == NULL) return NULL;
  Function* write_complete_tuple_fn = CodegenWriteCompleteTuple(node, codegen);
  if (write_complete_tuple_fn == NULL) return NULL;
  return CodegenWriteAlignedTuples(node, codegen, write_complete_tuple_fn);
}

Status HdfsSequenceScanner::InitNewRange() {
  DCHECK(header_ != NULL);
  only_parsing_header_ = false;

  HdfsPartitionDescriptor* hdfs_partition = context_->partition_descriptor();

  text_converter_.reset(new TextConverter(hdfs_partition->escape_char()));

  delimited_text_parser_.reset(new DelimitedTextParser(scan_node_, '\0',
      hdfs_partition->field_delim(), hdfs_partition->collection_delim(),
      hdfs_partition->escape_char()));

  num_buffered_records_in_compressed_block_ = 0;

  template_tuple_ = context_->template_tuple();

  SeqFileHeader* seq_header = reinterpret_cast<SeqFileHeader*>(header_);
  if (seq_header->is_compressed) {
    // For record-compressed data we always want to copy, since the records tend to be
    // small and would otherwise pin a much larger mempool chunk.
    if (seq_header->is_row_compressed) context_->set_compact_data(true);
    RETURN_IF_ERROR(Codec::CreateDecompressor(state_,
        data_buffer_pool_.get(), context_->compact_data(),
        header_->codec, &decompressor_));
  }

  // Initialize the codegen'd function.
  RETURN_IF_ERROR(InitializeCodegenFn(hdfs_partition,
      THdfsFileFormat::SEQUENCE_FILE, "HdfsSequenceScanner"));
  return Status::OK;
}

Status HdfsSequenceScanner::Prepare() {
  RETURN_IF_ERROR(BaseSequenceScanner::Prepare());

  // Allocate the scratch space for two-pass parsing. The maximum number of fields we
  // can process in one parse pass is the batch size (in tuples) times the number of
  // materialized fields per tuple.
  // TODO: This should probably be based on L2/L3 cache sizes (as should the batch size).
  record_locations_.resize(state_->batch_size());
  field_locations_.resize(state_->batch_size() * scan_node_->materialized_slots().size());
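  // For example, with a batch size of 1024 rows and three materialized slots this
  // reserves 1024 record locations and 3 * 1024 = 3072 field locations (illustrative
  // numbers only; the batch size is a runtime setting).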
  return Status::OK;
}

BaseSequenceScanner::FileHeader* HdfsSequenceScanner::AllocateFileHeader() {
  return new SeqFileHeader;
}

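// On-disk framing handled by GetRecord() and ReadBlockHeader() (a sketch of the
// SequenceFile record format, not authoritative):
//   [record length: 4-byte int][key length: 4-byte int][key bytes][value bytes]
// A record length equal to SYNC_MARKER (-1) instead introduces a SYNC_HASH_SIZE-byte
// sync hash, after which the real record follows. The value is a serialized Text
// (variable-length integer length followed by the bytes, decoded with
// SerDeUtils::GetVLong()/ReadVLong()); for record-compressed files the value bytes
// are compressed and are decompressed into unparsed_data_buffer_.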
inline Status HdfsSequenceScanner::GetRecord(uint8_t** record_ptr,
    int64_t* record_len, bool* eosr) {
  // There are two cases:
  //   Record-compressed -- like a regular record, but the data is compressed.
  //   Uncompressed.

  block_start_ = context_->file_offset();
  bool sync;
  *eosr = context_->eosr();
  Status stat = ReadBlockHeader(&sync);
  if (!stat.ok()) {
    *record_ptr = NULL;
    if (*eosr) return Status::OK;
    return stat;
  }

  // If we read a sync marker and are past the end of the scan range, we are done.
  if (sync && *eosr) {
    *record_ptr = NULL;
    return Status::OK;
  }

  // We have not read past the next sync marker yet, so keep going.
  *eosr = false;

  // We don't look at the keys, only the values.
  RETURN_IF_FALSE(SerDeUtils::SkipBytes(context_, current_key_length_, &parse_status_));

  if (header_->is_compressed) {
    int in_size = current_block_length_ - current_key_length_;
    // Check for a reasonable size.
    if (in_size > context_->scan_range()->len() || in_size < 0) {
      stringstream ss;
      ss << "Compressed record size is: " << in_size;
      if (state_->LogHasSpace()) state_->LogError(ss.str());
      return Status(ss.str());
    }
    uint8_t* compressed_data;
    RETURN_IF_FALSE(
        SerDeUtils::ReadBytes(context_, in_size, &compressed_data, &parse_status_));

    int len = 0;
    RETURN_IF_ERROR(decompressor_->ProcessBlock(in_size, compressed_data,
        &len, &unparsed_data_buffer_));
    *record_ptr = unparsed_data_buffer_;
    // Read the length of the record.
    int size = SerDeUtils::GetVLong(*record_ptr, record_len);
    if (size == -1) {
      stringstream ss;
      ss << "Invalid record size";
      if (state_->LogHasSpace()) state_->LogError(ss.str());
      return Status(ss.str());
    }
    *record_ptr += size;
  } else {
    // Uncompressed records.
    RETURN_IF_FALSE(SerDeUtils::ReadVLong(context_, record_len, &parse_status_));
    if (*record_len > context_->scan_range()->len() || *record_len < 0) {
      stringstream ss;
      ss << "Record length is: " << *record_len;
      if (state_->LogHasSpace()) state_->LogError(ss.str());
      return Status(ss.str());
    }
    RETURN_IF_FALSE(
        SerDeUtils::ReadBytes(context_, *record_len, record_ptr, &parse_status_));
  }
  return Status::OK;
}

// Process block-compressed sequence files. This is the most commonly used sequence
// file format. The general strategy is to process the data in large chunks to
// minimize function calls. The process is:
//   1. Decompress an entire block.
//   2. In row-batch-sized chunks:
//     a. Collect the start of each record and its length.
//     b. Parse the column locations into field_locations_.
//     c. Materialize those field locations into row batches.
// This mimics the technique used for delimited text.
// This function only returns on error or when the entire scan range is complete.
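// Within a decompressed value block the records are laid out back to back as
//   [vlong length][record bytes][vlong length][record bytes]...
// (each record is a serialized Text value), which is what step 2a walks with
// SerDeUtils::GetVLong(). See ReadCompressedBlock() for how the block itself is
// located and decompressed.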
Status HdfsSequenceScanner::ProcessBlockCompressedScanRange() {
  DCHECK(header_->is_compressed);

  while (!context_->eosr() || num_buffered_records_in_compressed_block_ > 0) {
    if (num_buffered_records_in_compressed_block_ == 0) {
      if (context_->eosr()) return Status::OK;
      // No more decompressed data; decompress the next block.
      RETURN_IF_ERROR(ReadCompressedBlock());
      if (num_buffered_records_in_compressed_block_ < 0) return parse_status_;
    }

    MemPool* pool;
    TupleRow* tuple_row;
    int64_t max_tuples = context_->GetMemory(&pool, &tuple_, &tuple_row);
    int num_to_commit = min(max_tuples, num_buffered_records_in_compressed_block_);
    num_buffered_records_in_compressed_block_ -= num_to_commit;

    if (scan_node_->materialized_slots().empty()) {
      // Handle the case where there are no slots to materialize (e.g. count(*)).
      num_to_commit = WriteEmptyTuples(context_, tuple_row, num_to_commit);
      if (num_to_commit > 0) context_->CommitRows(num_to_commit);
      continue;
    }

    // 2a. Parse record starts and lengths.
    int field_location_offset = 0;
    for (int i = 0; i < num_to_commit; ++i) {
      DCHECK_LT(i, record_locations_.size());
      int bytes_read = SerDeUtils::GetVLong(
          next_record_in_compressed_block_, &record_locations_[i].len);
      if (UNLIKELY(bytes_read == -1)) {
        stringstream ss;
        ss << "Invalid record size in compressed block.";
        if (state_->LogHasSpace()) state_->LogError(ss.str());
        return Status(ss.str());
      }
      next_record_in_compressed_block_ += bytes_read;
      record_locations_[i].record = next_record_in_compressed_block_;
      next_record_in_compressed_block_ += record_locations_[i].len;
    }

    // 2b. Parse records to find field locations.
    for (int i = 0; i < num_to_commit; ++i) {
      int num_fields = 0;
      if (delimited_text_parser_->escape_char() == '\0') {
        delimited_text_parser_->ParseSingleTuple<false>(record_locations_[i].len,
            reinterpret_cast<char*>(record_locations_[i].record),
            &field_locations_[field_location_offset], &num_fields);
      } else {
        delimited_text_parser_->ParseSingleTuple<true>(record_locations_[i].len,
            reinterpret_cast<char*>(record_locations_[i].record),
            &field_locations_[field_location_offset], &num_fields);
      }
      DCHECK_EQ(num_fields, scan_node_->materialized_slots().size());
      field_location_offset += num_fields;
      DCHECK_LE(field_location_offset, field_locations_.size());
    }

    int max_added_tuples = (scan_node_->limit() == -1) ?
        num_to_commit : scan_node_->limit() - scan_node_->rows_returned();

    // 2c. Materialize the parsed columns into tuples.
    SCOPED_TIMER(scan_node_->materialize_tuple_timer());
    // Call the codegen'd (jitted) function if possible.
    int tuples_returned;
    if (write_tuples_fn_ != NULL) {
      // Last argument: sequence records always start at record_locations_[0].
      tuples_returned = write_tuples_fn_(this, pool, tuple_row,
          context_->row_byte_size(), &field_locations_[0], num_to_commit,
          max_added_tuples, scan_node_->materialized_slots().size(), 0);
    } else {
      tuples_returned = WriteAlignedTuples(pool, tuple_row,
          context_->row_byte_size(), &field_locations_[0], num_to_commit,
          max_added_tuples, scan_node_->materialized_slots().size(), 0);
    }

    if (tuples_returned == -1) return parse_status_;
    context_->CommitRows(tuples_returned);
  }

  return Status::OK;
}

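// ProcessRange() dispatches block-compressed ranges to
// ProcessBlockCompressedScanRange() and otherwise handles one record at a time:
// GetRecord() fetches the (possibly record-compressed) value, ParseFieldLocations()
// splits it into delimited fields, and WriteCompleteTuple() materializes the row.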
Status HdfsSequenceScanner::ProcessRange() {
  num_buffered_records_in_compressed_block_ = 0;

  SeqFileHeader* seq_header = reinterpret_cast<SeqFileHeader*>(header_);
  // Block-compressed data is handled separately to minimize function calls.
  if (seq_header->is_compressed && !seq_header->is_row_compressed) {
    return ProcessBlockCompressedScanRange();
  }

  // We count the time here since there is too much overhead to do this on each record.
  SCOPED_TIMER(scan_node_->materialize_tuple_timer());

  bool eosr = false;
  while (!eosr) {
    // Current record to process and its length.
    uint8_t* record = NULL;
    int64_t record_len = 0;
    // Get the next compressed or uncompressed record.
    RETURN_IF_ERROR(GetRecord(&record, &record_len, &eosr));

    if (eosr) {
      DCHECK(record == NULL);
      break;
    }

    MemPool* pool;
    TupleRow* tuple_row_mem;
    int max_tuples = context_->GetMemory(&pool, &tuple_, &tuple_row_mem);
    DCHECK_GT(max_tuples, 0);

    bool add_row = false;

    // Parse the current record.
    if (scan_node_->materialized_slots().size() != 0) {
      char* col_start;
      uint8_t* record_start = record;
      int num_tuples = 0;
      int num_fields = 0;
      char* row_end_loc;
      uint8_t error_in_row = false;

      RETURN_IF_ERROR(delimited_text_parser_->ParseFieldLocations(
          1, record_len, reinterpret_cast<char**>(&record), &row_end_loc,
          &field_locations_[0], &num_tuples, &num_fields, &col_start));
      DCHECK(num_tuples == 1);

      uint8_t errors[num_fields];
      memset(errors, 0, sizeof(errors));

      add_row = WriteCompleteTuple(pool, &field_locations_[0], tuple_, tuple_row_mem,
          template_tuple_, &errors[0], &error_in_row);

      if (UNLIKELY(error_in_row)) {
        // Report all the fields that have errors.
        for (int i = 0; i < scan_node_->materialized_slots().size(); ++i) {
          if (errors[i]) {
            const SlotDescriptor* desc = scan_node_->materialized_slots()[i];
            ReportColumnParseError(
                desc, field_locations_[i].start, field_locations_[i].len);
          }
        }
        ++num_errors_in_file_;
        if (state_->LogHasSpace()) {
          stringstream ss;
          ss << "file: " << context_->filename() << endl
             << "record: " << string(reinterpret_cast<char*>(record_start), record_len);
          state_->LogError(ss.str());
        }
        if (state_->abort_on_error()) {
          state_->ReportFileErrors(context_->filename(), 1);
          return Status(state_->ErrorLog());
        }
      }
    } else {
      add_row = WriteEmptyTuples(context_, tuple_row_mem, 1);
    }

    if (add_row) context_->CommitRows(1);
    if (scan_node_->ReachedLimit()) break;
  }

  return Status::OK;
}

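// The file header read below is, roughly (sketch, not authoritative):
//   'S' 'E' 'Q' <version byte (6)>
//   key class name (Text)    -- skipped, the keys are not used
//   value class name (Text)  -- must be org.apache.hadoop.io.Text
//   is_compressed (boolean)
//   is_block_compressed (boolean)
//   codec class name (Text)  -- only present if is_compressed
//   metadata: 4-byte count, then that many <Text key, Text value> pairs -- skipped
//   sync hash (SYNC_HASH_SIZE bytes)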
Status HdfsSequenceScanner::ReadFileHeader() {
  uint8_t* header;

  RETURN_IF_FALSE(SerDeUtils::ReadBytes(context_,
      sizeof(SEQFILE_VERSION_HEADER), &header, &parse_status_));

  if (memcmp(header, SEQFILE_VERSION_HEADER, sizeof(SEQFILE_VERSION_HEADER))) {
    stringstream ss;
    ss << "Invalid SEQFILE_VERSION_HEADER: '"
       << SerDeUtils::HexDump(header, sizeof(SEQFILE_VERSION_HEADER)) << "'";
    return Status(ss.str());
  }

  // Skip the key class name; we don't use the keys.
  RETURN_IF_FALSE(SerDeUtils::SkipText(context_, &parse_status_));

  uint8_t* class_name;
  int len;
  RETURN_IF_FALSE(SerDeUtils::ReadText(context_, &class_name, &len, &parse_status_));
  if (memcmp(class_name, HdfsSequenceScanner::SEQFILE_VALUE_CLASS_NAME, len)) {
    stringstream ss;
    ss << "Invalid SEQFILE_VALUE_CLASS_NAME: '"
       << string(reinterpret_cast<char*>(class_name), len) << "'";
    return Status(ss.str());
  }

  SeqFileHeader* seq_header = reinterpret_cast<SeqFileHeader*>(header_);
  bool is_blk_compressed;
  RETURN_IF_FALSE(
      SerDeUtils::ReadBoolean(context_, &header_->is_compressed, &parse_status_));
  RETURN_IF_FALSE(
      SerDeUtils::ReadBoolean(context_, &is_blk_compressed, &parse_status_));
  seq_header->is_row_compressed = !is_blk_compressed;

  if (header_->is_compressed) {
    uint8_t* codec_ptr;
    RETURN_IF_FALSE(SerDeUtils::ReadText(context_, &codec_ptr, &len, &parse_status_));
    header_->codec = string(reinterpret_cast<char*>(codec_ptr), len);
    Codec::CodecMap::const_iterator it = Codec::CODEC_MAP.find(header_->codec);
    DCHECK(it != Codec::CODEC_MAP.end());
    header_->compression_type = it->second;
  } else {
    header_->compression_type = THdfsCompression::NONE;
  }
  VLOG_FILE << context_->filename() << ": "
      << (header_->is_compressed ?
          (seq_header->is_row_compressed ? "row compressed" : "block compressed") :
          "not compressed");
  if (header_->is_compressed) VLOG_FILE << header_->codec;

  // Skip the file metadata.
  int map_size = 0;
  RETURN_IF_FALSE(SerDeUtils::ReadInt(context_, &map_size, &parse_status_));

  for (int i = 0; i < map_size; ++i) {
    RETURN_IF_FALSE(SerDeUtils::SkipText(context_, &parse_status_));
    RETURN_IF_FALSE(SerDeUtils::SkipText(context_, &parse_status_));
  }

  // Read the file sync marker.
  uint8_t* sync;
  RETURN_IF_FALSE(SerDeUtils::ReadBytes(context_, SYNC_HASH_SIZE, &sync, &parse_status_));
  memcpy(header_->sync, sync, SYNC_HASH_SIZE);

  header_->header_size = context_->total_bytes_returned();
  header_->file_type = THdfsFileFormat::SEQUENCE_FILE;
  return Status::OK;
}

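// Each record starts with two 4-byte ints: the total record length and the key
// length. A record length equal to SYNC_MARKER (-1) means a sync hash follows
// instead; after consuming it, the real record length is read. (Sketch of the
// record framing as consumed here.)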
Status HdfsSequenceScanner::ReadBlockHeader(bool* sync) {
  RETURN_IF_FALSE(SerDeUtils::ReadInt(context_, &current_block_length_, &parse_status_));
  *sync = false;
  if (current_block_length_ == HdfsSequenceScanner::SYNC_MARKER) {
    RETURN_IF_ERROR(ReadSync());
    RETURN_IF_FALSE(SerDeUtils::ReadInt(context_, &current_block_length_, &parse_status_));
    *sync = true;
  }
  if (current_block_length_ < 0) {
    stringstream ss;
    int64_t position = context_->file_offset();
    position -= sizeof(int32_t);
    ss << "Bad block length: " << current_block_length_ << " at offset " << position;
    return Status(ss.str());
  }

  RETURN_IF_FALSE(SerDeUtils::ReadInt(context_, &current_key_length_, &parse_status_));
  if (current_key_length_ < 0) {
    stringstream ss;
    int64_t position = context_->file_offset();
    position -= sizeof(int32_t);
    ss << "Bad key length: " << current_key_length_ << " at offset " << position;
    return Status(ss.str());
  }

  return Status::OK;
}

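// Layout of a block-compressed block as consumed below (sketch, not authoritative):
//   sync escape: 4-byte int -1, followed by the sync hash
//   number of records (vlong)
//   compressed key-lengths buffer (skipped)
//   compressed keys buffer (skipped)
//   compressed value-lengths buffer (skipped)
//   compressed values buffer: vint size, then the compressed bytes, which are
//   decompressed into unparsed_data_buffer_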
Status HdfsSequenceScanner::ReadCompressedBlock() {
  // We are reading a new compressed block. Pass the previous buffer pool
  // bytes to the batch; we don't need them anymore.
  if (!context_->compact_data()) {
    context_->AcquirePool(data_buffer_pool_.get());
  }

  block_start_ = context_->file_offset();
  if (have_sync_) {
    // We skipped ahead on an error and already read the sync block.
    have_sync_ = false;
  } else {
    // Read the sync indicator and check the sync block.
    int sync_indicator;
    RETURN_IF_FALSE(SerDeUtils::ReadInt(context_, &sync_indicator, &parse_status_));
    if (sync_indicator != -1) {
      if (state_->LogHasSpace()) {
        stringstream ss;
        ss << "Expecting sync indicator (-1) at file offset "
           << (context_->file_offset() - sizeof(int)) << ". "
           << "Sync indicator found " << sync_indicator << ".";
        state_->LogError(ss.str());
      }
      return Status("Bad sync hash");
    }
    RETURN_IF_ERROR(ReadSync());
  }

  RETURN_IF_FALSE(SerDeUtils::ReadVLong(context_,
      &num_buffered_records_in_compressed_block_, &parse_status_));
  if (num_buffered_records_in_compressed_block_ < 0) {
    if (state_->LogHasSpace()) {
      stringstream ss;
      ss << "Bad compressed block record count: "
         << num_buffered_records_in_compressed_block_;
      state_->LogError(ss.str());
    }
    return Status("bad record count");
  }

  // Skip the compressed key-lengths and keys buffers; we don't need them.
  RETURN_IF_FALSE(SerDeUtils::SkipText(context_, &parse_status_));
  RETURN_IF_FALSE(SerDeUtils::SkipText(context_, &parse_status_));

  // Skip the compressed value-lengths buffer. We don't need it either, since the
  // records are in Text format with the length included.
  RETURN_IF_FALSE(SerDeUtils::SkipText(context_, &parse_status_));

  // Read the compressed value buffer from the unbuffered stream.
  int block_size = 0;
  RETURN_IF_FALSE(SerDeUtils::ReadVInt(context_, &block_size, &parse_status_));
  // Check for a reasonable size.
  if (block_size > MAX_BLOCK_SIZE || block_size < 0) {
    stringstream ss;
    ss << "Compressed block size is: " << block_size;
    if (state_->LogHasSpace()) state_->LogError(ss.str());
    return Status(ss.str());
  }

  uint8_t* compressed_data = NULL;
  RETURN_IF_FALSE(
      SerDeUtils::ReadBytes(context_, block_size, &compressed_data, &parse_status_));

  int len = 0;
  SCOPED_TIMER(decompress_timer_);
  RETURN_IF_ERROR(decompressor_->ProcessBlock(block_size, compressed_data,
      &len, &unparsed_data_buffer_));
  next_record_in_compressed_block_ = unparsed_data_buffer_;
  return Status::OK;
}

void HdfsSequenceScanner::LogRowParseError(stringstream* ss, int row_idx) {
  DCHECK(state_->LogHasSpace());
  DCHECK_LT(row_idx, record_locations_.size());
  *ss << string(reinterpret_cast<const char*>(record_locations_[row_idx].record),
      record_locations_[row_idx].len);
}