Files
impala/be/src/exec/base-sequence-scanner.h

149 lines
5.0 KiB
C++

// Copyright 2012 Cloudera Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef IMPALA_EXEC_BASE_SEQUENCE_SCANNER_H
#define IMPALA_EXEC_BASE_SEQUENCE_SCANNER_H
#include <vector>
#include <memory>
#include <stdint.h>
#include "exec/hdfs-scanner.h"
namespace impala {
// Forward declarations. This header only refers to these types through
// pointers or as template arguments.
class Codec;
struct HdfsFileDesc;
class ScanRangeContext;
// Superclass for all sequence container based file formats:
// e.g. SequenceFile, RCFile, Avro
// Sequence container formats have sync markers periodically in the file.
// This class will skip to the start of sync markers for errors and
// hdfs splits.
class BaseSequenceScanner : public HdfsScanner {
 public:
  // Issue the initial ranges for all sequence container files.
  static void IssueInitialRanges(HdfsScanNode*, const std::vector<HdfsFileDesc*>&);

  virtual Status Prepare();
  virtual Status Close();
  virtual Status ProcessScanRange(ScanRangeContext* context);

  virtual ~BaseSequenceScanner();

 protected:
  // Size of the sync hash field, in bytes.
  static const int SYNC_HASH_SIZE = 16;

  // Data that is shared between scan ranges of the same file. The subclass is
  // responsible for filling in all of these fields in ReadFileHeader().
  struct FileHeader {
    // Type of file: e.g. rcfile, seqfile
    THdfsFileFormat::type file_type;

    // The sync hash for this file.
    uint8_t sync[SYNC_HASH_SIZE];

    // True if the file is compressed.
    bool is_compressed;

    // Codec name, if the file is compressed.
    std::string codec;

    // Enum for the compression type.
    THdfsCompression::type compression_type;

    // Size of the header in bytes.
    int64_t header_size;
  };

  // Subclasses must implement the following functions. The order of calls will be
  //  1. AllocateFileHeader() - called once per file
  //  2. InitNewRange()
  //  3. ReadFileHeader()
  //  4. ProcessRange()
  // In the normal case, 2-4 are called once for each range. In the case of
  // errors and skipped bytes, 4 is called repeatedly, each time starting
  // right after the sync marker.

  // Allocate a file header object for this scanner. If the scanner needs
  // additional header information, it should subclass FileHeader.
  // The allocated object will be placed in the scan node's pool.
  virtual FileHeader* AllocateFileHeader() = 0;

  // Reset internal state for a new scan range.
  virtual Status InitNewRange() = 0;

  // Read the file header. The underlying ScanRangeContext is at the start of
  // the file header. This function must read the file header (which advances
  // context_ past it) and initialize header_.
  virtual Status ReadFileHeader() = 0;

  // Process the current range until the end is reached or an error occurs. Note
  // that this might be called multiple times if we skip over bad data.
  // This function should read from the underlying ScanRangeContext, materializing
  // tuples to the context. When this function is called, it is guaranteed to be
  // at the start of a data block (i.e. right after the sync marker).
  virtual Status ProcessRange() = 0;

  BaseSequenceScanner(HdfsScanNode*, RuntimeState*);

  // Read and validate the sync marker against header_->sync. Returns non-ok if the
  // sync marker did not match.
  Status ReadSync();

  // Utility function to advance to the next sync marker, reading bytes from context_.
  //  - sync: sync marker (does not include 0xFFFFFFFF prefix)
  //  - sync_size: number of bytes for sync
  //  - sync_found: returns if the sync marker was found before the end of the scan range
  Status SkipToSync(const uint8_t* sync, int sync_size, bool* sync_found);

  // Estimate of header size in bytes. This is the initial number of bytes to issue
  // per file. If the estimate is too low, more bytes will be read as necessary.
  static const int HEADER_SIZE;

  // Sync indicator.
  static const int SYNC_MARKER;

  // File header for this scan range. This is not owned by the parent scan node.
  FileHeader* header_;

  // If true, this scanner object is only for processing the header.
  bool only_parsing_header_;

  // If we skip ahead on error and read the sync block this is set to true
  // so we do not need to look for it in ProcessRange().
  bool have_sync_;

  // Byte offset from start of file for current block. Used for error reporting.
  // Kept as a 64-bit value (like FileHeader::header_size) so that offsets past
  // 2GB are not truncated.
  int64_t block_start_;

  // Decompressor class to use, if any.
  boost::scoped_ptr<Codec> decompressor_;

  // Pool to allocate per data block memory. This should be used with the
  // decompressor and any other per data block allocations.
  boost::scoped_ptr<MemPool> data_buffer_pool_;

  // Time spent decompressing bytes.
  RuntimeProfile::Counter* decompress_timer_;
};
}
#endif