Disable RC/Trevni (with option to allow it); remove file_buffer_size

IMP-336: remove the file_buffer_size query option
Add an "allow_unsupported_formats" query option to allow RC/Trevni in our tests; disabled by
default
Alan Choi
2012-09-26 19:04:51 -07:00
committed by Henry Robinson
parent 8d0ee9aebe
commit 0ce8a044e3
17 changed files with 156 additions and 55 deletions
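For orientation, a minimal sketch of how test code toggles the new option, assuming only the Thrift-generated TQueryOptions Java class introduced by this commit (the planner tests below set the field the same way):

import com.cloudera.impala.thrift.TQueryOptions;

public class AllowUnsupportedFormatsExample {
  public static void main(String[] args) {
    // After this commit, RC/Trevni scans are rejected at plan time unless a
    // test opts in explicitly; the option defaults to false.
    TQueryOptions options = new TQueryOptions();
    options.allow_unsupported_formats = true;
    System.out.println("allow_unsupported_formats = "
        + options.allow_unsupported_formats);
  }
}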

View File

@@ -46,7 +46,6 @@ RuntimeState::RuntimeState()
unreported_error_idx_(0),
profile_(obj_pool_.get(), "<unnamed>") {
query_options_.batch_size = DEFAULT_BATCH_SIZE;
query_options_.file_buffer_size = DEFAULT_FILE_BUFFER_SIZE;
}
RuntimeState::~RuntimeState() {
@@ -70,9 +69,6 @@ Status RuntimeState::Init(
if (query_options_.batch_size <= 0) {
query_options_.batch_size = DEFAULT_BATCH_SIZE;
}
if (query_options_.file_buffer_size <= 0) {
query_options_.file_buffer_size = DEFAULT_FILE_BUFFER_SIZE;
}
if (query_options_.max_io_buffers <= 0) {
query_options_.max_io_buffers = DEFAULT_MAX_IO_BUFFERS;
}
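With file_buffer_size gone, Init() applies this defaulting rule only to batch_size and max_io_buffers: any non-positive value falls back to the backend default. A small Java rendering of the same rule (hypothetical helper, not part of the commit):

public class OptionDefaults {
  static final int DEFAULT_BATCH_SIZE = 1024;   // from RuntimeState
  static final int DEFAULT_MAX_IO_BUFFERS = 5;  // per disk, from RuntimeState

  // Non-positive means "use the backend default", as in the C++ above.
  static int orDefault(int value, int defaultValue) {
    return value > 0 ? value : defaultValue;
  }

  public static void main(String[] args) {
    System.out.println(orDefault(0, DEFAULT_BATCH_SIZE));     // prints 1024
    System.out.println(orDefault(3, DEFAULT_MAX_IO_BUFFERS)); // prints 3
  }
}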

View File

@@ -61,7 +61,6 @@ class RuntimeState {
const DescriptorTbl& desc_tbl() const { return *desc_tbl_; }
void set_desc_tbl(DescriptorTbl* desc_tbl) { desc_tbl_ = desc_tbl; }
int batch_size() const { return query_options_.batch_size; }
int file_buffer_size() const { return query_options_.file_buffer_size; }
bool abort_on_error() const { return query_options_.abort_on_error; }
int max_errors() const { return query_options_.max_errors; }
int max_io_buffers() const { return query_options_.max_io_buffers; }
@@ -121,7 +120,6 @@ class RuntimeState {
private:
static const int DEFAULT_BATCH_SIZE = 1024;
static const int DEFAULT_FILE_BUFFER_SIZE = 1024 * 1024;
// This is the number of buffers per disk.
static const int DEFAULT_MAX_IO_BUFFERS = 5;

View File

@@ -1388,18 +1388,20 @@ void ImpalaServer::QueryToTClientRequest(const Query& query,
case TImpalaQueryOptions::MAX_SCAN_RANGE_LENGTH:
request->queryOptions.max_scan_range_length = atol(key_value[1].c_str());
break;
case TImpalaQueryOptions::FILE_BUFFER_SIZE:
request->queryOptions.file_buffer_size = atoi(key_value[1].c_str());
break;
case TImpalaQueryOptions::MAX_IO_BUFFERS:
request->queryOptions.max_io_buffers = atoi(key_value[1].c_str());
break;
case TImpalaQueryOptions::NUM_SCANNER_THREADS:
request->queryOptions.num_scanner_threads = atoi(key_value[1].c_str());
break;
case TImpalaQueryOptions::PARTITION_AGG:
request->queryOptions.partition_agg =
iequals(key_value[1], "true") || iequals(key_value[1], "1");
break;
case TImpalaQueryOptions::ALLOW_UNSUPPORTED_FORMATS:
request->queryOptions.allow_unsupported_formats =
iequals(key_value[1], "true") || iequals(key_value[1], "1");
break;
default:
// We hit this DCHECK(false) if we forget to add the corresponding entry here
// when we add a new query option.
@@ -1630,9 +1632,6 @@ void ImpalaServer::InitializeConfigVariables() {
case TImpalaQueryOptions::MAX_SCAN_RANGE_LENGTH:
value << default_options.max_scan_range_length;
break;
case TImpalaQueryOptions::FILE_BUFFER_SIZE:
value << default_options.file_buffer_size;
break;
case TImpalaQueryOptions::MAX_IO_BUFFERS:
value << default_options.max_io_buffers;
break;
@@ -1642,6 +1641,9 @@ void ImpalaServer::InitializeConfigVariables() {
case TImpalaQueryOptions::PARTITION_AGG:
value << default_options.partition_agg;
break;
case TImpalaQueryOptions::ALLOW_UNSUPPORTED_FORMATS:
value << default_options.allow_unsupported_formats;
break;
default:
// We hit this DCHECK(false) if we forget to add the corresponding entry here
// when we add a new query option.
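Boolean options such as ALLOW_UNSUPPORTED_FORMATS and PARTITION_AGG accept either "true" or "1", case-insensitively, while numeric options go through atoi/atol. A hypothetical Java mirror of the boolean rule:

public class BoolOptionParser {
  // True iff the value is "true" or "1", ignoring case -- the same rule
  // the iequals() calls above implement in C++.
  static boolean parseBooleanOption(String value) {
    return value.equalsIgnoreCase("true") || value.equalsIgnoreCase("1");
  }

  public static void main(String[] args) {
    System.out.println(parseBooleanOption("TRUE")); // true
    System.out.println(parseBooleanOption("0"));    // false
  }
}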

View File

@@ -39,9 +39,6 @@
DEFINE_int32(batch_size, 0,
"batch size to be used by backend; a batch size of 0 indicates the "
"backend's default batch size");
DEFINE_int32(file_buffer_size, 0,
"file buffer size used by text parsing; size of 0 indicates the "
"backend's default file buffer size");
DEFINE_int32(max_scan_range_length, 0,
"maximum length of the scan range; only applicable to HDFS scan range; a length of 0"
" indicates backend default");
@@ -117,9 +114,9 @@ Status InProcessQueryExecutor::Exec(const string& query,
query_options.disable_codegen = !FLAGS_enable_jit;
query_options.max_errors = FLAGS_max_errors;
query_options.num_nodes = FLAGS_num_nodes;
query_options.file_buffer_size = FLAGS_file_buffer_size;
query_options.max_scan_range_length = FLAGS_max_scan_range_length;
query_options.num_scanner_threads = FLAGS_num_scanner_threads;
query_options.allow_unsupported_formats = true;
try {
SCOPED_TIMER(plan_gen_counter);
@@ -325,8 +322,8 @@ Status InProcessQueryExecutor::Explain(const string& query, string* explain_plan
query_options.disable_codegen = !FLAGS_enable_jit;
query_options.max_errors = FLAGS_max_errors;
query_options.num_nodes = FLAGS_num_nodes;
query_options.file_buffer_size = FLAGS_file_buffer_size;
query_options.max_scan_range_length = FLAGS_max_scan_range_length;
query_options.allow_unsupported_formats = true;
TClientRequest client_request;
client_request.__set_stmt(query.c_str());

View File

@@ -33,9 +33,9 @@ struct TQueryOptions {
6: required i32 num_nodes = JavaConstants.NUM_NODES_ALL
7: required i64 max_scan_range_length = 0
8: required i32 file_buffer_size = 0
9: required i32 num_scanner_threads = 0
10: required i32 max_io_buffers = 0
8: required i32 num_scanner_threads = 0
9: required i32 max_io_buffers = 0
10: required bool allow_unsupported_formats = 0
11: required bool partition_agg = 0
}

View File

@@ -44,10 +44,6 @@ enum TImpalaQueryOptions {
// a length of 0 indicates backend default;
MAX_SCAN_RANGE_LENGTH,
// file buffer size used by text parsing; size of 0 indicates the backend's default
// file buffer size
FILE_BUFFER_SIZE,
// Maximum number of io buffers (per disk)
MAX_IO_BUFFERS,
@@ -58,6 +54,9 @@ enum TImpalaQueryOptions {
// consumption, but may require an additional repartitioning step on the grouping
// exprs; ignored if no grouping
PARTITION_AGG,
// If true, Impala will try to execute queries on file formats that are not fully supported yet
ALLOW_UNSUPPORTED_FORMATS
}
// The summary of an insert.

View File

@@ -37,8 +37,8 @@ const map<ImpalaService.TImpalaQueryOptions, string> DEFAULT_QUERY_OPTIONS = {
ImpalaService.TImpalaQueryOptions.BATCH_SIZE : "0",
ImpalaService.TImpalaQueryOptions.NUM_NODES : "0",
ImpalaService.TImpalaQueryOptions.MAX_SCAN_RANGE_LENGTH : "0",
ImpalaService.TImpalaQueryOptions.FILE_BUFFER_SIZE : "0",
ImpalaService.TImpalaQueryOptions.MAX_IO_BUFFERS : "0",
ImpalaService.TImpalaQueryOptions.NUM_SCANNER_THREADS : "0",
ImpalaService.TImpalaQueryOptions.PARTITION_AGG : "false",
ImpalaService.TImpalaQueryOptions.ALLOW_UNSUPPORTED_FORMATS : "false"
}
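These defaults travel as strings: the test harness renders each entry as a "key=value" pair that ImpalaServer splits and parses with the switch shown earlier. A hypothetical sketch, assuming the Thrift-generated TImpalaQueryOptions Java enum:

import com.cloudera.impala.thrift.TImpalaQueryOptions;

public class DefaultOptionStrings {
  public static void main(String[] args) {
    // Render one default as the "key=value" form the backend parses.
    String entry = TImpalaQueryOptions.ALLOW_UNSUPPORTED_FORMATS.name()
        + "=" + "false";
    System.out.println(entry); // ALLOW_UNSUPPORTED_FORMATS=false
  }
}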

View File

@@ -16,9 +16,11 @@ import org.slf4j.LoggerFactory;
import com.cloudera.impala.analysis.Analyzer;
import com.cloudera.impala.analysis.TupleDescriptor;
import com.cloudera.impala.catalog.HdfsFileFormat;
import com.cloudera.impala.catalog.HdfsPartition;
import com.cloudera.impala.catalog.HdfsTable;
import com.cloudera.impala.common.InternalException;
import com.cloudera.impala.common.NotImplementedException;
import com.cloudera.impala.thrift.Constants;
import com.cloudera.impala.thrift.TExplainLevel;
import com.cloudera.impala.thrift.THdfsFileSplit;
@@ -435,4 +437,21 @@ public class HdfsScanNode extends ScanNode {
output.append(super.getExplainString(prefix + " ", detailLevel));
return output.toString();
}
/**
* Raises a NotImplementedException if any of the partitions has an unsupported file format
* (RC or Trevni).
* Can only be called after finalize().
*/
public void validateFileFormat() throws NotImplementedException {
for (HdfsPartition partition : partitions) {
HdfsFileFormat format = partition.getInputFormatDescriptor().getFileFormat();
if (format == HdfsFileFormat.RC_FILE || format == HdfsFileFormat.TREVNI) {
StringBuilder error = new StringBuilder();
error.append("Table ").append(desc.getTable().getFullName())
.append(" has unsupported format ").append(format.name());
throw new NotImplementedException(error.toString());
}
}
}
}

View File

@@ -100,6 +100,14 @@ public class NewPlanner {
for (PlanFragment fragment: fragments) {
fragment.finalize(analyzer);
if (!queryOptions.allow_unsupported_formats) {
// verify that hdfs partitions use only supported formats after partition pruning
ArrayList<HdfsScanNode> hdfsScans = Lists.newArrayList();
fragment.getPlanRoot().collectSubclasses(HdfsScanNode.class, hdfsScans);
for (HdfsScanNode hdfsScanNode: hdfsScans) {
hdfsScanNode.validateFileFormat();
}
}
}
Collections.reverse(fragments);
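Both planners depend on collectSubclasses() to gather every HdfsScanNode from the finalized plan, after partition pruning has removed partitions that will never be scanned. A self-contained sketch of that traversal pattern (the Node type here is a stand-in, not Impala's actual PlanNode API):

import java.util.ArrayList;
import java.util.List;

abstract class Node {
  final List<Node> children = new ArrayList<Node>();

  // Collect this node and every descendant that is an instance of cls,
  // in pre-order -- the shape of the collectSubclasses() calls above.
  <T> void collectSubclasses(Class<T> cls, List<T> out) {
    if (cls.isInstance(this)) {
      out.add(cls.cast(this));
    }
    for (Node child : children) {
      child.collectSubclasses(cls, out);
    }
  }
}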

View File

@@ -783,6 +783,19 @@ public class Planner {
// don't compute mem layout before marking slots that aren't being referenced
analyzer.getDescTbl().computeMemLayout();
if (!queryOptions.allow_unsupported_formats) {
// verify that hdfs partitions use only supported formats after partition pruning
ArrayList<HdfsScanNode> hdfsScans = Lists.newArrayList();
if (numNodes == 1) {
root.collectSubclasses(HdfsScanNode.class, hdfsScans);
} else {
slave.collectSubclasses(HdfsScanNode.class, hdfsScans);
}
for (HdfsScanNode hdfsScanNode: hdfsScans) {
hdfsScanNode.validateFileFormat();
}
}
// TODO: determine if slavePlan produces more slots than are being
// ref'd by coordPlan; if so, insert MaterializationNode that trims the
// output

View File

@@ -34,11 +34,9 @@ import com.cloudera.impala.thrift.TQueryExecRequest2;
import com.cloudera.impala.thrift.TQueryOptions;
import com.cloudera.impala.thrift.TScanRangeLocation;
import com.cloudera.impala.thrift.TScanRangeLocations;
import com.cloudera.impala.thrift.TSessionState;
import com.cloudera.impala.thrift.TStmtType;
import com.google.common.base.Preconditions;
import com.cloudera.impala.thrift.TSessionState;
import com.google.common.collect.Lists;
public class NewPlannerTest {
@@ -116,11 +114,10 @@ public class NewPlannerTest {
* locations to actualScanRangeLocations; compares both to the appropriate sections
* of 'testCase'.
*/
private void RunTestCase(
TestCase testCase, StringBuilder errorLog, StringBuilder actualOutput) {
private void RunTestCase(TestCase testCase, TQueryOptions options,
StringBuilder errorLog, StringBuilder actualOutput) {
String query = testCase.getQuery();
LOG.info("running query " + query);
TQueryOptions options = new TQueryOptions();
// single-node plan
ArrayList<String> expectedPlan = testCase.getSectionContents(Section.PLAN);
@@ -227,7 +224,7 @@ public class NewPlannerTest {
// TODO: check that scan range locations are identical in both cases
}
private void runPlannerTestFile(String testFile) {
private void runPlannerTestFile(String testFile, TQueryOptions options) {
String fileName = testDir + "/" + testFile + ".test";
TestFileParser queryFileParser = new TestFileParser(fileName);
StringBuilder actualOutput = new StringBuilder();
@@ -237,7 +234,7 @@ public class NewPlannerTest {
for (TestCase testCase : queryFileParser.getTestCases()) {
actualOutput.append(testCase.getSectionAsString(Section.QUERY, true, "\n"));
actualOutput.append("\n");
RunTestCase(testCase, errorLog, actualOutput);
RunTestCase(testCase, options, errorLog, actualOutput);
actualOutput.append("====\n");
}
@@ -259,6 +256,12 @@ public class NewPlannerTest {
}
}
private void runPlannerTestFile(String testFile) {
TQueryOptions options = new TQueryOptions();
options.allow_unsupported_formats = true;
runPlannerTestFile(testFile, options);
}
@Test
public void testDistinct() {
runPlannerTestFile("distinct");
@@ -284,6 +287,13 @@ public class NewPlannerTest {
runPlannerTestFile("hdfs");
}
@Test
public void testUnsupportedFormat() {
TQueryOptions options = new TQueryOptions();
options.allow_unsupported_formats = false;
runPlannerTestFile("unsupported-hdfs-format", options);
}
@Test
public void testJoins() {
runPlannerTestFile("joins");

View File

@@ -86,14 +86,12 @@ public class PlannerTest {
}
}
private void RunUnimplementedQuery(String query, int numNodes,
StringBuilder errorLog) {
private void RunUnimplementedQuery(String query, TQueryOptions options,
StringBuilder errorLog) {
try {
AnalysisContext.AnalysisResult analysisResult = analysisCtxt.analyze(query);
Planner planner = new Planner();
explainStringBuilder.setLength(0);
TQueryOptions options = new TQueryOptions();
options.setNum_nodes(numNodes);
planner.createPlanFragments(analysisResult, options, explainStringBuilder);
errorLog.append(
@@ -108,7 +106,7 @@ public class PlannerTest {
}
}
private void runPlannerTestFile(String testFile) {
private void runPlannerTestFile(String testFile, TQueryOptions options) {
String fileName = testDir + "/" + testFile + ".test";
TestFileParser queryFileParser = new TestFileParser(fileName);
StringBuilder actualOutput = new StringBuilder();
@@ -129,16 +127,19 @@ public class PlannerTest {
ArrayList<String> singleNodePlan = testCase.getSectionContents(Section.PLAN);
if (singleNodePlan.size() > 0 &&
singleNodePlan.get(0).toLowerCase().startsWith("not implemented")) {
RunUnimplementedQuery(query, 1, errorLog);
options.setNum_nodes(1);
RunUnimplementedQuery(query, options, errorLog);
actualOutput.append("not implemented\n");
} else {
// Run single-node query.
RunQuery(query, 1, testCase, Section.PLAN, errorLog, actualOutput);
// Check if multi-node query is implemented.
ArrayList<String> multiNodePlan = testCase.getSectionContents(Section.DISTRIBUTEDPLAN);
ArrayList<String> multiNodePlan =
testCase.getSectionContents(Section.DISTRIBUTEDPLAN);
if (multiNodePlan.size() > 0 &&
multiNodePlan.get(0).toLowerCase().startsWith("not implemented")) {
RunUnimplementedQuery(query, Constants.NUM_NODES_ALL, errorLog);
options.setNum_nodes(Constants.NUM_NODES_ALL);
RunUnimplementedQuery(query, options, errorLog);
actualOutput.append("not implemented\n");
} else {
actualOutput.append("------------ DISTRIBUTEDPLAN\n");
@@ -172,6 +173,12 @@ public class PlannerTest {
}
}
private void runPlannerTestFile(String testFile) {
TQueryOptions options = new TQueryOptions();
options.allow_unsupported_formats = true;
runPlannerTestFile(testFile, options);
}
@Test
public void testDistinct() {
runPlannerTestFile("distinct");
@@ -197,6 +204,13 @@ public class PlannerTest {
runPlannerTestFile("hdfs");
}
@Test
public void testUnsupportedFormat() {
TQueryOptions options = new TQueryOptions();
options.allow_unsupported_formats = false;
runPlannerTestFile("unsupported-hdfs-format", options);
}
@Test
public void testJoins() {
runPlannerTestFile("joins");

View File

@@ -44,7 +44,7 @@ public class QueryTest extends BaseQueryTest {
// in the data. For sequence files this is to the next sync mark; for text
// it is the next end-of-record delimiter.
TestExecContext execContext1 =
new TestExecContext(1, 0, false, true, 0, 5000, 0, false);
new TestExecContext(1, 0, false, true, 0, 5000, false, true);
List<TestConfiguration> testConfigs = Lists.newArrayList();
testConfigs.add(
new TestConfiguration(execContext1, CompressionFormat.NONE, TableFormat.TEXT));
@@ -103,12 +103,12 @@ public class QueryTest extends BaseQueryTest {
// 1. scan range with no tuple
// 2. tuples that span multiple scan ranges
TestExecContext execContext1 =
new TestExecContext(2, 1, true, true, 0, 1, 0, false);
new TestExecContext(2, 1, true, true, 0, 1, false, false);
// We use a very small scan range to test the HDFS scanner init code that seeks the
// first tuple delimiter.
TestExecContext execContext2 =
new TestExecContext(2, 1, true, true, 0, 5, 1, false);
new TestExecContext(2, 1, true, true, 0, 5, false, false);
List<TestConfiguration> testConfigs = Lists.newArrayList();
testConfigs.add(

View File

@@ -167,8 +167,8 @@ public class ImpaladClientExecutor {
case MAX_SCAN_RANGE_LENGTH:
optionValue = String.valueOf(queryOptions.getMax_scan_range_length());
break;
case FILE_BUFFER_SIZE:
optionValue = String.valueOf(queryOptions.getFile_buffer_size());
case ALLOW_UNSUPPORTED_FORMATS:
optionValue = String.valueOf(queryOptions.allow_unsupported_formats);
break;
case MAX_IO_BUFFERS:
optionValue = String.valueOf(queryOptions.getMax_io_buffers());

View File

@@ -18,26 +18,27 @@ public class TestExecContext {
public TestExecContext(int numNodes, int batchSize, boolean disableCodegen,
boolean abortOnError, int maxErrors, long maxScanRangeLength,
int fileBufferSize, boolean partitionAgg) {
// TODO: turn on multiple threads by setting that 1 to 0. This doesn't currently
// pass all the tests due to numerical precision issues. With multiple threads
// and a small batch size, aggregation over float columns yields slightly
// different results.
queryOptions = new TQueryOptions(abortOnError, maxErrors, disableCodegen, batchSize,
true, numNodes, maxScanRangeLength, fileBufferSize, 1, 0, false);
boolean partitionAgg, boolean allowUnsupportedFormats) {
queryOptions = new TQueryOptions();
queryOptions.abort_on_error = abortOnError;
queryOptions.max_errors = maxErrors;
queryOptions.disable_codegen = disableCodegen;
queryOptions.batch_size = batchSize;
queryOptions.num_nodes = numNodes;
queryOptions.max_scan_range_length = maxScanRangeLength;
queryOptions.file_buffer_size = fileBufferSize;
// TODO: turn on multiple threads by setting that 1 to 0. This doesn't currently
// pass all the tests due to numerical precision issues. With multiple threads
// and a small batch size, aggregation over float columns yields slightly
// different results.
queryOptions.num_scanner_threads = 1;
queryOptions.max_io_buffers = 0;
queryOptions.allow_unsupported_formats = allowUnsupportedFormats;
queryOptions.partition_agg = partitionAgg;
}
public TestExecContext(int numNodes, int batchSize, boolean disableCodegen,
boolean abortOnError, int maxErrors) {
this(numNodes, batchSize, disableCodegen, abortOnError, maxErrors, 0, 0, false);
this(numNodes, batchSize, disableCodegen, abortOnError, maxErrors, 0, false, true);
}
public TestExecContext(TQueryOptions queryOptions, int fetchSize) {

View File

@@ -0,0 +1,22 @@
// RC is not supported
select * from alltypes_rc
---- PLAN
not implemented
------------ DISTRIBUTEDPLAN
not implemented
====
// Trevni is not supported
select * from alltypes_trevni
---- PLAN
not implemented
------------ DISTRIBUTEDPLAN
not implemented
====
// alltypesmixedformat has RC and Trevni partitions, which are not supported
select * from alltypesmixedformat
---- PLAN
not implemented
------------ DISTRIBUTEDPLAN
not implemented
====

View File

@@ -0,0 +1,22 @@
// RC is not supported
select * from alltypes_rc
---- PLAN
not implemented
------------ DISTRIBUTEDPLAN
not implemented
====
// Trevni is not supported
select * from alltypes_trevni
---- PLAN
not implemented
------------ DISTRIBUTEDPLAN
not implemented
====
// alltypesmixedformat has RC and Trevni partitions, which are not supported
select * from alltypesmixedformat
---- PLAN
not implemented
------------ DISTRIBUTEDPLAN
not implemented
====