# Licensed to the Apache Software Foundation (ASF) under one # or more contributor license agreements. See the NOTICE file # distributed with this work for additional information # regarding copyright ownership. The ASF licenses this file # to you under the Apache License, Version 2.0 (the # "License"); you may not use this file except in compliance # with the License. You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, # software distributed under the License is distributed on an # "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. from __future__ import absolute_import, division, print_function from datetime import date, datetime, time, timedelta from decimal import Decimal from functools import reduce import os import struct from subprocess import check_call import sys from builtins import map from thrift.protocol import TCompactProtocol from thrift.transport import TTransport from impala_thrift_gen.parquet.ttypes import FileMetaData, Type PARQUET_VERSION_NUMBER = b'PAR1' def create_protocol(serialized_object_buffer): """Creates a thrift protocol object from a memory buffer. The buffer should contain a serialized thrift object. """ transport = TTransport.TMemoryBuffer(serialized_object_buffer) return TCompactProtocol.TCompactProtocol(transport) def julian_day_to_date(julian_day): """Converts a julian day number into a Gregorian date. The reference date is picked arbitrarily and can be validated with an online converter like http://aa.usno.navy.mil/jdconverter?ID=AA&jd=2457755 """ return date(2017, 1, 1) + timedelta(julian_day - 2457755) def nanos_to_time(nanos): """Converts nanoseconds to time of day.""" micros = nanos // 1000 # integer division seconds, micros = divmod(micros, 10**6) minutes, seconds = divmod(seconds, 60) hours, minutes = divmod(minutes, 60) return time(hours, minutes, seconds, micros) def parse_boolean(s): """Parses a single boolean value from a single byte""" return struct.unpack(' 0 assert schema.type_length == len(value) assert schema.type == Type.FIXED_LEN_BYTE_ARRAY if sys.version_info.major < 3: byte_values = list(map(ord, value)) else: byte_values = list(value) numeric = Decimal(reduce(lambda x, y: x * 256 + y, byte_values)) # Compute two's complement for negative values. if (byte_values[0] > 127): bit_width = 8 * len(value) numeric = numeric - (2 ** bit_width) return numeric / 10 ** schema.scale def decode_stats_value(schema, value): """Decodes 'value' according to 'schema. It expects 'value' to be plain encoded. For BOOLEAN values, only the least significant bit is parsed and returned. Binary arrays are expected to be stored as such, without a preceding length. """ column_type = schema.type if column_type == Type.BOOLEAN: return parse_boolean(value) elif column_type == Type.INT32: return parse_int32(value) elif column_type == Type.INT64: return parse_int64(value) elif column_type == Type.INT96: # Impala uses INT96 to store timestamps return decode_timestamp(value) elif column_type == Type.FLOAT: return parse_float(value) elif column_type == Type.DOUBLE: return parse_double(value) elif column_type == Type.BYTE_ARRAY: # In parquet::Statistics, strings are stored as is. return value elif column_type == Type.FIXED_LEN_BYTE_ARRAY: return decode_decimal(schema, value) assert False return None def read_serialized_object(thrift_class, file, file_pos, length): """Reads an instance of class 'thrift_class' from an already opened file at the given position. """ file.seek(file_pos) serialized_thrift_object = file.read(length) protocol = create_protocol(serialized_thrift_object) thrift_object = thrift_class() thrift_object.read(protocol) return thrift_object def get_parquet_metadata(filename): """Returns a FileMetaData as defined in parquet.thrift. 'filename' must be a local file path. """ file_size = os.path.getsize(filename) with open(filename, 'rb') as f: # Check file starts and ends with magic bytes start_magic = f.read(len(PARQUET_VERSION_NUMBER)) assert start_magic == PARQUET_VERSION_NUMBER f.seek(file_size - len(PARQUET_VERSION_NUMBER)) end_magic = f.read(len(PARQUET_VERSION_NUMBER)) assert end_magic == PARQUET_VERSION_NUMBER # Read metadata length f.seek(file_size - len(PARQUET_VERSION_NUMBER) - 4) metadata_len = parse_int32(f.read(4)) # Calculate metadata position in file metadata_pos = file_size - len(PARQUET_VERSION_NUMBER) - 4 - metadata_len # Return deserialized FileMetaData object return read_serialized_object(FileMetaData, f, metadata_pos, metadata_len) def get_parquet_metadata_from_hdfs_folder(hdfs_path, tmp_dir): """Returns a list with the FileMetaData of every file in 'hdfs_path' and its subdirectories. The hdfs folder is copied into 'tmp_dir' before processing. """ check_call(['hdfs', 'dfs', '-get', hdfs_path, tmp_dir]) # Only walk the new directory to make the same tmp_dir usable for multiple tables. table_dir = os.path.join(tmp_dir, os.path.basename(os.path.normpath(hdfs_path))) result = [] for root, subdirs, files in os.walk(table_dir): for f in files: if not f.endswith('parq'): continue parquet_file = os.path.join(root, str(f)) file_meta_data = get_parquet_metadata(parquet_file) result.append(file_meta_data) return result