// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.

#include <boost/bind.hpp>

#include "exec/base-sequence-scanner.h"

#include "exec/hdfs-scan-node.h"
#include "exec/scanner-context.inline.h"
#include "runtime/runtime-state.h"
#include "runtime/string-search.h"
#include "util/codec.h"
#include "util/runtime-profile-counters.h"
#include "util/test-info.h"

#include "common/names.h"

using namespace impala;

const int BaseSequenceScanner::HEADER_SIZE = 1024;
const int BaseSequenceScanner::SYNC_MARKER = -1;

// Constants used in ReadPastSize()
static const double BLOCK_SIZE_PADDING_PERCENT = 0.1;
static const int MIN_SYNC_READ_SIZE = 64 * 1024;  // bytes

// Macro to convert SerdeUtil errors to Status returns.
#define RETURN_IF_FALSE(x) if (UNLIKELY(!(x))) return parse_status_

Status BaseSequenceScanner::IssueInitialRanges(HdfsScanNode* scan_node,
    const vector<HdfsFileDesc*>& files) {
  // Issue just the header range for each file. When the header is complete,
  // we'll issue the splits for that file. Splits cannot be processed until the
  // header is parsed (the header object is then shared across splits for that file).
  vector<DiskIoMgr::ScanRange*> header_ranges;
  for (int i = 0; i < files.size(); ++i) {
    ScanRangeMetadata* metadata =
        reinterpret_cast<ScanRangeMetadata*>(files[i]->splits[0]->meta_data());
    int64_t header_size = min<int64_t>(HEADER_SIZE, files[i]->file_length);
    // The header is almost always a remote read. Set the disk id to -1 and indicate
    // it is not cached.
    // TODO: add a remote disk id and plumb it through to the io mgr. Should it also
    // have one queue per NIC?
    DiskIoMgr::ScanRange* header_range = scan_node->AllocateScanRange(
        files[i]->fs, files[i]->filename.c_str(), header_size, 0, metadata->partition_id,
        -1, false, false, files[i]->mtime);
    header_ranges.push_back(header_range);
  }
  // Issue the header ranges only. ProcessSplit() will issue the files' scan ranges
  // and those ranges will need scanner threads, so no files are marked completed yet.
  RETURN_IF_ERROR(scan_node->AddDiskIoRanges(header_ranges, 0));
  return Status::OK();
}

BaseSequenceScanner::BaseSequenceScanner(HdfsScanNode* node, RuntimeState* state,
    bool add_batches_to_queue)
  : HdfsScanner(node, state, add_batches_to_queue),
    header_(NULL),
    block_start_(0),
    total_block_size_(0),
    num_syncs_(0) {
}

BaseSequenceScanner::BaseSequenceScanner()
  : HdfsScanner(),
    header_(NULL),
    block_start_(0),
    total_block_size_(0),
    num_syncs_(0) {
  DCHECK(TestInfo::is_test());
}

BaseSequenceScanner::~BaseSequenceScanner() {
}
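
// Open() hooks this scanner up to the shared scan state: it registers the
// read-past callback used by the ScannerContext, adds the "BytesSkipped" profile
// counter, and allocates the first row batch.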
Status BaseSequenceScanner::Open(ScannerContext* context) {
  RETURN_IF_ERROR(HdfsScanner::Open(context));
  stream_->set_read_past_size_cb(bind(&BaseSequenceScanner::ReadPastSize, this, _1));
  bytes_skipped_counter_ = ADD_COUNTER(
      scan_node_->runtime_profile(), "BytesSkipped", TUnit::BYTES);
  // Allocate a new row batch. May fail if mem limit is exceeded.
  RETURN_IF_ERROR(StartNewRowBatch());
  return Status::OK();
}
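
// Close() tears the scanner down. The decompressor is closed before the final row
// batch is flushed because it can hold memory in its temp_memory_pool_; the range
// is reported complete unless this scanner was only parsing the header.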
void BaseSequenceScanner::Close(RowBatch* row_batch) {
  VLOG_FILE << "Bytes read past scan range: " << -stream_->bytes_left();
  VLOG_FILE << "Average block size: "
            << (num_syncs_ > 1 ? total_block_size_ / (num_syncs_ - 1) : 0);
  // Need to close the decompressor before releasing the resources at
  // AddFinalRowBatch(), because in some cases there is memory allocated in
  // decompressor_'s temp_memory_pool_.
  if (decompressor_.get() != NULL) {
    decompressor_->Close();
    decompressor_.reset(NULL);
  }
  if (row_batch != NULL) {
    row_batch->tuple_data_pool()->AcquireData(data_buffer_pool_.get(), false);
    context_->ReleaseCompletedResources(row_batch, true);
    if (add_batches_to_queue_) scan_node_->AddMaterializedRowBatch(row_batch);
  }
  // Verify all resources (if any) have been transferred.
  DCHECK_EQ(data_buffer_pool_.get()->total_allocated_bytes(), 0);
  DCHECK_EQ(context_->num_completed_io_buffers(), 0);
  // 'header_' can be NULL if HdfsScanNode::CreateAndPrepareScanner() failed.
  if (!only_parsing_header_ && header_ != NULL) {
    scan_node_->RangeComplete(file_format(), header_->compression_type);
  }
  HdfsScanner::Close(row_batch);
}
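
// ProcessSplit() handles one scan range. For a file's first (header-only) range, it
// parses and caches the header and issues the file's remaining splits; for data
// ranges, it skips to the first sync at or after the range start and processes
// blocks until the range is exhausted, recovering from parse errors by skipping
// ahead to the next sync.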
Status BaseSequenceScanner::ProcessSplit() {
  DCHECK(add_batches_to_queue_);
  header_ = reinterpret_cast<FileHeader*>(
      scan_node_->GetFileMetadata(stream_->filename()));
  if (header_ == NULL) {
    // This is the initial scan range, issued just to parse the header.
    only_parsing_header_ = true;
    header_ = state_->obj_pool()->Add(AllocateFileHeader());
    Status status = ReadFileHeader();
    if (!status.ok()) {
      RETURN_IF_ERROR(state_->LogOrReturnError(status.msg()));
      // We need to complete the ranges for this file.
      CloseFileRanges(stream_->filename());
      return Status::OK();
    }

    // The header is parsed; set the metadata in the scan node and issue more ranges.
    scan_node_->SetFileMetadata(stream_->filename(), header_);
    HdfsFileDesc* desc = scan_node_->GetFileDesc(stream_->filename());
    scan_node_->AddDiskIoRanges(desc);
    return Status::OK();
  }

  // Initialize state for the new scan range.
  finished_ = false;
  // If the file is compressed, the buffers in stream_ are not used directly.
  if (header_->is_compressed) stream_->set_contains_tuple_data(false);
  RETURN_IF_ERROR(InitNewRange());

  Status status = Status::OK();

  // Skip to the first record.
  if (stream_->file_offset() < header_->header_size) {
    // If the scan range starts within the header, skip to the end of the header so we
    // don't accidentally skip to an extra sync within the header.
    RETURN_IF_FALSE(stream_->SkipBytes(
        header_->header_size - stream_->file_offset(), &parse_status_));
  }
  RETURN_IF_ERROR(SkipToSync(header_->sync, SYNC_HASH_SIZE));

  // Process the range.
  while (!finished_) {
    status = ProcessRange();
    if (status.ok()) break;
    if (status.IsCancelled() || status.IsMemLimitExceeded()) return status;

    // Log the error from file format parsing.
    state_->LogError(ErrorMsg(TErrorCode::SEQUENCE_SCANNER_PARSE_ERROR,
        stream_->filename(), stream_->file_offset(),
        (stream_->eof() ? "(EOF)" : "")));

    // Make sure errors specified in the status are logged as well.
    state_->LogError(status.msg());

    // If abort_on_error is set, return the error; otherwise try to recover.
    if (state_->abort_on_error()) return status;

    // Recover by skipping ahead to the next sync.
    parse_status_ = Status::OK();
    int64_t error_offset = stream_->file_offset();
    status = SkipToSync(header_->sync, SYNC_HASH_SIZE);
    COUNTER_ADD(bytes_skipped_counter_, stream_->file_offset() - error_offset);
    RETURN_IF_ERROR(status);
    DCHECK(parse_status_.ok());
  }

  // All done with this scan range.
  return Status::OK();
}
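
// ReadSync() consumes the sync marker at the current stream position and validates
// it against the sync recorded in the file header. It also updates the block-size
// statistics that ReadPastSize() uses.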
Status BaseSequenceScanner::ReadSync() {
  // We are finished when we read a sync marker occurring completely in the next
  // scan range.
  finished_ = stream_->eosr();

  uint8_t* hash;
  int64_t out_len;
  RETURN_IF_FALSE(stream_->GetBytes(SYNC_HASH_SIZE, &hash, &out_len, &parse_status_));
  if (out_len != SYNC_HASH_SIZE || memcmp(hash, header_->sync, SYNC_HASH_SIZE)) {
    stringstream ss;
    ss << "Bad synchronization marker" << endl
       << "  Expected: '"
       << ReadWriteUtil::HexDump(header_->sync, SYNC_HASH_SIZE) << "'" << endl
       << "  Actual: '"
       << ReadWriteUtil::HexDump(hash, SYNC_HASH_SIZE) << "'";
    return Status(ss.str());
  }
  finished_ |= stream_->eof();
  total_block_size_ += stream_->file_offset() - block_start_;
  block_start_ = stream_->file_offset();
  ++num_syncs_;
  return Status::OK();
}
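
// Searches 'buffer' for the 'sync_len'-byte pattern 'sync'. Returns the offset just
// past the end of the first occurrence, or -1 if no occurrence is found within
// 'buffer_len' bytes.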
int BaseSequenceScanner::FindSyncBlock(const uint8_t* buffer, int buffer_len,
    const uint8_t* sync, int sync_len) {
  char* sync_str = reinterpret_cast<char*>(const_cast<uint8_t*>(sync));
  StringValue needle = StringValue(sync_str, sync_len);

  StringValue haystack(
      const_cast<char*>(reinterpret_cast<const char*>(buffer)), buffer_len);

  StringSearch search(&needle);
  int offset = search.Search(&haystack);

  if (offset != -1) {
    // Advance offset past sync.
    offset += sync_len;
  }
  return offset;
}
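
// Advances the stream to the first byte after the next sync marker, taking care to
// find markers that straddle buffer boundaries. Sets finished_ if no sync starts
// within the remainder of this scan range.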
Status BaseSequenceScanner::SkipToSync(const uint8_t* sync, int sync_size) {
  // Offset into the current buffer of the end of the sync (-1 until found).
  int offset = -1;
  uint8_t* buffer;
  int64_t buffer_len;
  Status status;

  // Read buffers until we find a sync or reach the end of the scan range. If we read all
  // the buffers remaining in the scan range and none of them contain a sync (including a
  // sync that starts at the end of this scan range and continues into the next one), then
  // there are no more syncs in this scan range and we're finished.
  while (!stream_->eosr()) {
    // Check if there's a sync fully contained in the current buffer.
    RETURN_IF_ERROR(stream_->GetBuffer(true, &buffer, &buffer_len));
    offset = FindSyncBlock(buffer, buffer_len, sync, sync_size);
    DCHECK_LE(offset, buffer_len);
    if (offset != -1) break;

    // No sync found in the current buffer, so check if there's a sync spanning the
    // current buffer and the next. First we skip so there are sync_size - 1 bytes left,
    // then we read these bytes plus the first sync_size - 1 bytes of the next buffer.
    // This guarantees that we find any syncs that start in the current buffer and end in
    // the next buffer.
    int64_t to_skip = max<int64_t>(0, buffer_len - (sync_size - 1));
    RETURN_IF_FALSE(stream_->SkipBytes(to_skip, &parse_status_));
    // Peek so we don't advance stream_ into the next buffer. If we don't find a sync
    // here then we'll need to check all of the next buffer, including the first
    // sync_size - 1 bytes.
    RETURN_IF_FALSE(stream_->GetBytes(
        (sync_size - 1) * 2, &buffer, &buffer_len, &parse_status_, true));
    offset = FindSyncBlock(buffer, buffer_len, sync, sync_size);
    DCHECK_LE(offset, buffer_len);
    if (offset != -1) break;

    // No sync starting in this buffer, so advance stream_ to the beginning of the next
    // buffer.
    RETURN_IF_ERROR(stream_->GetBuffer(false, &buffer, &buffer_len));
  }

  if (offset == -1) {
    // No more syncs in this scan range.
    DCHECK(stream_->eosr());
    finished_ = true;
    return Status::OK();
  }
  DCHECK_GE(offset, sync_size);

  // Make sure the sync starts in our scan range.
  if (offset - sync_size >= stream_->bytes_left()) {
    finished_ = true;
    return Status::OK();
  }

  RETURN_IF_FALSE(stream_->SkipBytes(offset, &parse_status_));
  VLOG_FILE << "Found sync for: " << stream_->filename()
            << " at " << stream_->file_offset() - sync_size;
  if (stream_->eof()) finished_ = true;
  block_start_ = stream_->file_offset();
  ++num_syncs_;
  return Status::OK();
}
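
// Called when header parsing fails: marks all of the file's splits complete and
// charges their lengths to the bytes-skipped counter so the scan node does not
// wait for them.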
void BaseSequenceScanner::CloseFileRanges(const char* filename) {
  DCHECK(only_parsing_header_);
  HdfsFileDesc* desc = scan_node_->GetFileDesc(filename);
  const vector<DiskIoMgr::ScanRange*>& splits = desc->splits;
  for (int i = 0; i < splits.size(); ++i) {
    COUNTER_ADD(bytes_skipped_counter_, splits[i]->len());
    scan_node_->RangeComplete(file_format(), THdfsCompression::NONE);
  }
}
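
// Estimates how far past the scan range to read to finish the current block: the
// average size of the blocks seen so far, minus what has already been read of the
// current block, plus BLOCK_SIZE_PADDING_PERCENT of padding, floored at
// MIN_SYNC_READ_SIZE. As an illustration (numbers not from the source): with a 1 MB
// average block size and 768 KB of the current block already read, this returns
// roughly 256 KB plus ~100 KB of padding.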
int BaseSequenceScanner::ReadPastSize(int64_t file_offset) {
  DCHECK_GE(total_block_size_, 0);
  // This scan range didn't include a complete block, so we have no idea how many bytes
  // remain in the block. Let ScannerContext use its default strategy.
  if (total_block_size_ == 0) return 0;
  DCHECK_GE(num_syncs_, 2);
  int average_block_size = total_block_size_ / (num_syncs_ - 1);

  // Number of bytes read in the current block.
  int block_bytes_read = file_offset - block_start_;
  DCHECK_GE(block_bytes_read, 0);
  int bytes_left = max(average_block_size - block_bytes_read, 0);
  // Include some padding.
  bytes_left += average_block_size * BLOCK_SIZE_PADDING_PERCENT;
  return max(bytes_left, MIN_SYNC_READ_SIZE);
}