// Copyright 2012 Cloudera Inc. // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // // http://www.apache.org/licenses/LICENSE-2.0 // // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef IMPALA_EXEC_BASE_SEQUENCE_SCANNER_H #define IMPALA_EXEC_BASE_SEQUENCE_SCANNER_H #include #include #include #include "exec/hdfs-scanner.h" namespace impala { class Codec; struct HdfsFileDesc; class ScanRangeContext; // Superclass for all sequence container based file formats: // e.g. SequenceFile, RCFile, Avro // Sequence container formats have sync markers periodically in the file. // This class is will skip to the start of sync markers for errors and // hdfs splits. class BaseSequenceScanner : public HdfsScanner { public: // Issue the initial ranges for all sequence container files. static void IssueInitialRanges(HdfsScanNode*, const std::vector&); virtual Status Prepare(); virtual Status Close(); virtual Status ProcessScanRange(ScanRangeContext* context); virtual ~BaseSequenceScanner(); protected: // Size of the sync hash field. const static int SYNC_HASH_SIZE = 16; // Data that is shared between scan ranges of the same file. The subclass is // responsible for filling in all these fields in ReadFileHeader struct FileHeader { // Type of file: e.g. rcfile, seqfile THdfsFileFormat::type file_type; // The sync hash for this file. uint8_t sync[SYNC_HASH_SIZE]; // true if the file is compressed bool is_compressed; // Codec name if it is compressed std::string codec; // Enum for compression type. THdfsCompression::type compression_type; // Byte size of header int64_t header_size; }; // Subclasses must implement these functions. The order for calls will be // 1. AllocateFileHeader() - called once per file // 2. InitNewRange() // 3. ReadFileHeader() // 4. ProcessRange() // In the normal case, 2-4 is called for each range once. In the case of // errors and skipped bytes, 4 is repeatedly called, each time starting // right after the sync marker. // Allocate a file header object for this scanner. If the scanner needs // additional header information, it should subclass FileHeader. // The allocated object will be placed in the scan node's pool. virtual FileHeader* AllocateFileHeader() = 0; // Reset internal state for a new scan range. virtual Status InitNewRange() = 0; // Read the file header. The underlying ScanRangeContext is at the start of // the file header. This function must read the file header (which advances // context_ past it) and initialize header_. virtual Status ReadFileHeader() = 0; // Process the current range until the end or an error occurred. Note this might // be called multiple times if we skip over bad data. // This function should read from the underlying ScanRangeContext materializing // tuples to the context. When this function is called, it is guaranteed to be // at the start of a data block (i.e. right after the sync marker). virtual Status ProcessRange() = 0; BaseSequenceScanner(HdfsScanNode*, RuntimeState*); // Read and validate sync marker against header_->sync. Returns non-ok if the // sync marker did not match. Status ReadSync(); // Utility function to advance to the next sync marker, reading bytes from context_. // - sync: sync marker (does not include 0xFFFFFFFF prefix) // - sync_size: number of bytes for sync // - sync_found: returns if the sync marker was found before the end of the scan range Status SkipToSync( const uint8_t* sync, int sync_size, bool* sync_found); // Estimate of header size in bytes. This is initial number of bytes to issue // per file. If the estimate is too low, more bytes will be read as necessary. const static int HEADER_SIZE; // Sync indicator. const static int SYNC_MARKER; // File header for this scan range. This is not owned by the parent scan node. FileHeader* header_; // If true, this scanner object is only for processing the header. bool only_parsing_header_; // If we skip ahead on error and read the sync block this is set to true // so we do not need to look for it in ProcessRange bool have_sync_; // Byte offset from start of file for current block. Used for error reporting. int block_start_; // Decompressor class to use, if any. boost::scoped_ptr decompressor_; // Pool to allocate per data block memory. This should be used with the // decompressor and any other per data block allocations. boost::scoped_ptr data_buffer_pool_; // Time spent decompressing bytes RuntimeProfile::Counter* decompress_timer_; }; } #endif