From 2ebf554dfdb0dc9055ef95c8f2ec4fad51f1e657 Mon Sep 17 00:00:00 2001 From: Qifan Chen Date: Wed, 12 Aug 2020 16:33:51 -0400 Subject: [PATCH] IMPALA-7779 Parquet Scanner can write binary data into profile This fix addresses a limitation in which an ill-formatted Parquet version string is not converted to a printable form before appearing in an error message or in impalad.INFO. With the fix, any such string is converted to a hex string first. The hex string consists of four groups separated by spaces, where each group is one or two hex digits, such as "6c 65 2e a". Testing: Ran "core" tests successfully. Change-Id: I281d6fa7cb2f88f04588110943e3e768678b9cf1 Reviewed-on: http://gerrit.cloudera.org:8080/16331 Tested-by: Impala Public Jenkins Reviewed-by: Sahil Takiar --- be/src/exec/parquet/hdfs-parquet-scanner.cc | 3 ++- common/thrift/generate_error_codes.py | 2 +- .../workloads/functional-query/queries/QueryTest/parquet.test | 2 +- 3 files changed, 4 insertions(+), 3 deletions(-) diff --git a/be/src/exec/parquet/hdfs-parquet-scanner.cc b/be/src/exec/parquet/hdfs-parquet-scanner.cc index d06539641..dd81e8c10 100644 --- a/be/src/exec/parquet/hdfs-parquet-scanner.cc +++ b/be/src/exec/parquet/hdfs-parquet-scanner.cc @@ -1331,8 +1331,9 @@ Status HdfsParquetScanner::ProcessFooter() { uint8_t* magic_number_ptr = buffer + scan_range_len - sizeof(PARQUET_VERSION_NUMBER); if (memcmp(magic_number_ptr, PARQUET_VERSION_NUMBER, sizeof(PARQUET_VERSION_NUMBER)) != 0) { + // Report the ill-formatted Parquet version string in hex. 
return Status(TErrorCode::PARQUET_BAD_VERSION_NUMBER, filename(), - string(reinterpret_cast<char*>(magic_number_ptr), sizeof(PARQUET_VERSION_NUMBER)), + ReadWriteUtil::HexDump(magic_number_ptr, sizeof(PARQUET_VERSION_NUMBER)), scan_node_->hdfs_table()->fully_qualified_name()); } diff --git a/common/thrift/generate_error_codes.py b/common/thrift/generate_error_codes.py index 39983a3c2..390b2ce5e 100755 --- a/common/thrift/generate_error_codes.py +++ b/common/thrift/generate_error_codes.py @@ -193,7 +193,7 @@ error_codes = ( "Try running \\\"refresh $1\\\" to reload the file metadata."), ("PARQUET_BAD_VERSION_NUMBER", 60, "File '$0' has an invalid Parquet version number: " - "$1\\n. Please check that it is a valid Parquet file. " + "$1.\\nPlease check that it is a valid Parquet file. " "This error can also occur due to stale metadata. " "If you believe this is a valid Parquet file, try running \\\"refresh $2\\\"."), diff --git a/testdata/workloads/functional-query/queries/QueryTest/parquet.test b/testdata/workloads/functional-query/queries/QueryTest/parquet.test index b0b188f36..2aee0d0e7 100644 --- a/testdata/workloads/functional-query/queries/QueryTest/parquet.test +++ b/testdata/workloads/functional-query/queries/QueryTest/parquet.test @@ -50,7 +50,7 @@ bigint,bigint,string,string,boolean,boolean,bigint,bigint,bigint,bigint # Parquet file with invalid magic number SELECT * from bad_magic_number ---- CATCH -File '$NAMENODE/test-warehouse/bad_magic_number_parquet/bad_magic_number.parquet' has an invalid Parquet version number: XXXX +File '$NAMENODE/test-warehouse/bad_magic_number_parquet/bad_magic_number.parquet' has an invalid Parquet version number: 58 58 58 58 ==== ---- QUERY # count(*) query on parquet file with multiple blocks (one block per node)