mirror of
https://github.com/apache/impala.git
synced 2025-12-19 09:58:28 -05:00
IMPALA-14386: Add benchmarks for Byte Stream Split encoding
This patch adds benchmarks to the Byte Stream Split encoding. It
compares different ways to use the decoder.
I added benchmarks for the following comparisons:
* Compile VS Runtime initialized decoder
* Float VS Int VS Double VS Long VS 6 and 11 byte size types
* Repeating VS Sequential VS Random ordered data
* Decoding one by one VS in batch VS with stride (!= byte_size)
* Small VS Medium (10x small) VS Large (100x small) stride
Conclusions:
* Passing the byte size as a template parameter is almost 5 times
as fast as passing it in the constructor.
* The size of the type heavily influences the speed
* The data variation doesn't influence the speed at all
* Reading values in batch is much faster than one-by-one
* The stride sizes have a small influence on the speed
For more details and graphs, go to
https://docs.google.com/spreadsheets/d/129LwvR6gpZInlRhlVWktn6Haugwo_fnloAAYfI0Qp2s
Change-Id: I708af625348b0643aa3f37525b8a6e74f0c47057
Reviewed-on: http://gerrit.cloudera.org:8080/23401
Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
This commit is contained in:
committed by
Impala Public Jenkins
parent
134c28d445
commit
c4c9adf592
@@ -53,6 +53,7 @@ ADD_BE_BENCHMARK(network-perf-benchmark)
|
||||
ADD_BE_BENCHMARK(overflow-benchmark)
|
||||
ADD_BE_BENCHMARK(parse-timestamp-benchmark)
|
||||
ADD_BE_BENCHMARK(parquet-delta-benchmark)
|
||||
ADD_BE_BENCHMARK(parquet-byte-stream-split-decoder-benchmark)
|
||||
ADD_BE_BENCHMARK(process-wide-locks-benchmark)
|
||||
ADD_BE_BENCHMARK(rle-benchmark)
|
||||
ADD_BE_BENCHMARK(row-batch-serialize-benchmark)
|
||||
|
||||
514
be/src/benchmarks/parquet-byte-stream-split-decoder-benchmark.cc
Normal file
514
be/src/benchmarks/parquet-byte-stream-split-decoder-benchmark.cc
Normal file
@@ -0,0 +1,514 @@
|
||||
/// Licensed to the Apache Software Foundation (ASF) under one
|
||||
/// or more contributor license agreements. See the NOTICE file
|
||||
/// distributed with this work for additional information
|
||||
/// regarding copyright ownership. The ASF licenses this file
|
||||
/// to you under the Apache License, Version 2.0 (the
|
||||
/// "License"); you may not use this file except in compliance
|
||||
/// with the License. You may obtain a copy of the License at
|
||||
///
|
||||
/// http://www.apache.org/licenses/LICENSE-2.0
|
||||
///
|
||||
/// Unless required by applicable law or agreed to in writing,
|
||||
/// software distributed under the License is distributed on an
|
||||
/// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
|
||||
/// KIND, either express or implied. See the License for the
|
||||
/// specific language governing permissions and limitations
|
||||
/// under the License.
|
||||
|
||||
#include <cstdint>
|
||||
#include <iostream>
|
||||
|
||||
#include "exec/parquet/parquet-byte-stream-split-decoder.h"
|
||||
#include "exec/parquet/parquet-byte-stream-split-encoder.h"
|
||||
#include "util/benchmark.h"
|
||||
#include "util/cpu-info.h"
|
||||
|
||||
using namespace impala;
|
||||
|
||||
constexpr int DATA_BATCH_SIZE = 1000;
|
||||
|
||||
// -------------------------------- Benchmark Results --------------------------------- //
|
||||
|
||||
// Machine Info: 13th Gen Intel(R) Core(TM) i9-13900
|
||||
// Data Batch Size = 1000
|
||||
// Data Pool Size for Pooled Data = 124
|
||||
// Skip Sizes (Read | Skip): 82 | 18
|
||||
// Stride Sizes (S | M | L): 15 | 2985 | 213525
|
||||
|
||||
// ━━━━━━━━━━━━━━━━━━━━━ Byte Stream Split functionality comparison ━━━━━━━━━━━━━━━━━━━━━━
|
||||
|
||||
// ────────────────────── Compile VS Runtime | Sequential | Batched ──────────────────────
|
||||
// Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
|
||||
// (relative) (relative) (relative)
|
||||
// ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
||||
// Compile Int 2.46e+03 2.49e+03 2.52e+03 1X 1X 1X
|
||||
// Runtime Int 467 470 475 0.19X 0.189X 0.188X
|
||||
// Compile Long 1.17e+03 1.19e+03 1.21e+03 0.476X 0.479X 0.48X
|
||||
// Runtime Long 200 202 203 0.0811X 0.0811X 0.0806X
|
||||
|
||||
|
||||
// ───────────────────── Type Comparison | Runtime | Random | Batched ────────────────────
|
||||
// Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
|
||||
// (relative) (relative) (relative)
|
||||
// ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
||||
// Int 452 470 474 1X 1X 1X
|
||||
// Float 453 469 474 1X 0.998X 1X
|
||||
// 6 bytes 269 283 284 0.596X 0.602X 0.6X
|
||||
// Long 194 202 203 0.429X 0.429X 0.429X
|
||||
// Double 194 202 203 0.429X 0.429X 0.429X
|
||||
// 11 bytes 137 141 142 0.304X 0.3X 0.3X
|
||||
|
||||
|
||||
// ────────────── Repeating VS Sequential VS Random | Compile Time | Batched ─────────────
|
||||
// Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
|
||||
// (relative) (relative) (relative)
|
||||
// ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
||||
// Repeating Int 2.36e+03 2.47e+03 2.51e+03 1X 1X 1X
|
||||
// Sequential Int 2.41e+03 2.48e+03 2.52e+03 1.02X 1X 1X
|
||||
// Random Int 2.4e+03 2.49e+03 2.52e+03 1.02X 1.01X 1X
|
||||
// Repeating Long 1.16e+03 1.18e+03 1.22e+03 0.491X 0.479X 0.484X
|
||||
// Sequential Long 1.15e+03 1.19e+03 1.21e+03 0.486X 0.479X 0.48X
|
||||
// Random Long 1.14e+03 1.18e+03 1.21e+03 0.484X 0.477X 0.481X
|
||||
|
||||
|
||||
// ──────────────── Singles VS Batch VS Stride | Compile Time | Sequential ───────────────
|
||||
// Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
|
||||
// (relative) (relative) (relative)
|
||||
// ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
||||
// Singles Int 1.24e+03 1.27e+03 1.28e+03 1X 1X 1X
|
||||
// Batch Int 2.42e+03 2.48e+03 2.51e+03 1.95X 1.95X 1.96X
|
||||
// Stride Int 2.41e+03 2.49e+03 2.51e+03 1.94X 1.95X 1.96X
|
||||
// Singles Long 812 827 837 0.653X 0.65X 0.652X
|
||||
// Batch Long 1.16e+03 1.19e+03 1.21e+03 0.934X 0.931X 0.941X
|
||||
// Stride Long 1.18e+03 1.21e+03 1.23e+03 0.949X 0.954X 0.962X
|
||||
|
||||
|
||||
// ──────── Small VS Medium VS Large Stride | Compile Time | Sequential | Batched ────────
|
||||
// Function iters/ms 10%ile 50%ile 90%ile 10%ile 50%ile 90%ile
|
||||
// (relative) (relative) (relative)
|
||||
// ┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄┄
|
||||
// S Stride Int 2.41e+03 2.49e+03 2.52e+03 1X 1X 1X
|
||||
// M Stride Int 1.92e+03 2e+03 2.03e+03 0.795X 0.804X 0.806X
|
||||
// L Stride Int 1.87e+03 1.92e+03 1.95e+03 0.774X 0.772X 0.774X
|
||||
// S Stride Long 1.16e+03 1.22e+03 1.23e+03 0.481X 0.488X 0.49X
|
||||
// M Stride Long 1.01e+03 1.08e+03 1.09e+03 0.419X 0.434X 0.433X
|
||||
// L Stride Long 987 1.03e+03 1.04e+03 0.409X 0.413X 0.414X
|
||||
|
||||
// --------------------------------- Data Structures ---------------------------------- //
|
||||
|
||||
template <int B_SIZE>
|
||||
struct BSSTestData {
|
||||
const std::vector<uint8_t>& input_bdata;
|
||||
const int stride;
|
||||
|
||||
std::vector<uint8_t> encoded_bdata;
|
||||
std::vector<uint8_t> output;
|
||||
|
||||
BSSTestData(const std::vector<uint8_t>& b, int s = B_SIZE) : input_bdata(b), stride(s) {
|
||||
output.resize(stride * (input_bdata.size() / B_SIZE));
|
||||
GenerateBSSEncoded();
|
||||
}
|
||||
|
||||
private:
|
||||
void GenerateBSSEncoded() {
|
||||
ParquetByteStreamSplitEncoder<0> encoder(B_SIZE);
|
||||
std::vector<uint8_t> temp(input_bdata.size());
|
||||
encoded_bdata.resize(input_bdata.size());
|
||||
encoder.NewPage(temp.data(), temp.size());
|
||||
for (int i = 0; i < input_bdata.size() / B_SIZE; i++) {
|
||||
if (!encoder.PutBytes(input_bdata.data() + i * B_SIZE)) {
|
||||
std::cerr << "Error: Value could not be put at ind " << i << std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
if (encoder.FinalizePage(encoded_bdata.data(), encoded_bdata.size())
|
||||
!= input_bdata.size() / B_SIZE) {
|
||||
std::cerr << "Error: Could not write all values upon FinalizePage" << std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
};
|
||||
|
||||
// --------------------------------- Helper Functions --------------------------------- //
|
||||
|
||||
// ............ Data Generator Functions ............ //
|
||||
|
||||
// Fill the vector with the same repeating data. (42,42,42,42,...)
|
||||
void DataSameRepGen(std::vector<uint8_t>* bdata, int b_size) {
|
||||
for (int i = 0; i < DATA_BATCH_SIZE * b_size; i++) {
|
||||
bdata->push_back(0x75);
|
||||
}
|
||||
}
|
||||
|
||||
// Fill the vector with sequential data (41,42,43,44,...).
|
||||
void DataSequentialGen(std::vector<uint8_t>* bdata, int b_size) {
|
||||
bdata->resize(DATA_BATCH_SIZE * b_size);
|
||||
long offset = rand() * rand() - DATA_BATCH_SIZE;
|
||||
for (int i = 0; i < DATA_BATCH_SIZE; i++) {
|
||||
long j = i + offset;
|
||||
memcpy(bdata->data() + i * b_size, &j, std::min((int)sizeof(j), b_size));
|
||||
}
|
||||
}
|
||||
|
||||
// Fill the vector with completely random data.
|
||||
void DataRandGen(std::vector<uint8_t>* bdata, int b_size) {
|
||||
srand(154698135);
|
||||
for (int i = 0; i < DATA_BATCH_SIZE * b_size; i++) {
|
||||
bdata->push_back(rand() % numeric_limits<uint8_t>::max());
|
||||
}
|
||||
}
|
||||
|
||||
// .......... Benchmark Data Transformer Functions .......... //
|
||||
|
||||
// Builds the expected result of a strided decode: every BSIZE-byte value of 'input'
// is placed at the start of its own 'stride'-byte slot and the rest of the slot is
// zero. Trailing input bytes that do not form a whole value are ignored. 'stride' is
// expected to be at least BSIZE.
template <int BSIZE>
std::vector<uint8_t> GenerateStrided(const std::vector<uint8_t>& input, int stride) {
  const auto value_count = input.size() / BSIZE;
  std::vector<uint8_t> strided_bd(value_count * stride);

  const uint8_t* src = input.data();
  uint8_t* dst = strided_bd.data();
  for (auto remaining = value_count; remaining > 0; --remaining) {
    memcpy(dst, src, BSIZE);
    src += BSIZE;
    dst += stride;
  }
  return strided_bd;
}
|
||||
|
||||
// ........... Output Checking Functions ............ //
|
||||
|
||||
// Compares a decoded buffer with the bytes it is expected to contain and reports the
// first problem on stderr: either a size mismatch (both sizes are printed) or the
// index of the first differing byte. Prints nothing when the buffers are equal.
// A plain operator== would also work, but the explicit report is easier to debug.
void testOutputCorrectness(
    const std::vector<uint8_t>& output, const std::vector<uint8_t>& expected) {
  const bool same_length = output.size() == expected.size();
  if (!same_length) {
    std::cerr << "Vector sizes do not match" << std::endl;
    std::cerr << "Output size (bytes): " << output.size()
              << ", Expected size (bytes): " << expected.size() << std::endl;
    return;
  }

  // Advance to the first position where the two buffers disagree (if any).
  int pos = 0;
  while (pos < expected.size() && output[pos] == expected[pos]) ++pos;

  if (pos < expected.size()) {
    std::cerr << "Vectors do not match at index " << pos << std::endl;
  }
}
|
||||
|
||||
// ------------------------------ Benchmarked Functions ------------------------------- //
|
||||
|
||||
// ................... BSS Tests .................... //
|
||||
|
||||
template <int B_SIZE, class ParquetByteStreamSplitDecoder>
|
||||
void BSS_DecodeBatch(int batch_size, void* d, ParquetByteStreamSplitDecoder& decoder) {
|
||||
BSSTestData<B_SIZE>* data = reinterpret_cast<BSSTestData<B_SIZE>*>(d);
|
||||
|
||||
for (int batch = 0; batch < batch_size; batch++) {
|
||||
uint8_t* output_ptr = data->output.data();
|
||||
decoder.NewPage(data->encoded_bdata.data(), data->encoded_bdata.size());
|
||||
|
||||
if (decoder.NextValues(data->encoded_bdata.size() / B_SIZE, output_ptr, B_SIZE)
|
||||
!= data->encoded_bdata.size() / B_SIZE) {
|
||||
std::cerr << "Error: Could not decode all values" << std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int B_SIZE>
|
||||
void BSSRun_DecodeBatch(int batch_size, void* d) {
|
||||
ParquetByteStreamSplitDecoder<0> decoder(B_SIZE);
|
||||
BSS_DecodeBatch<B_SIZE>(batch_size, d, decoder);
|
||||
}
|
||||
|
||||
template <int B_SIZE>
|
||||
void BSSComp_DecodeBatch(int batch_size, void* d) {
|
||||
ParquetByteStreamSplitDecoder<B_SIZE> decoder;
|
||||
BSS_DecodeBatch<B_SIZE>(batch_size, d, decoder);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void BSSComp_DecodeSingles(int batch_size, void* d) {
|
||||
BSSTestData<sizeof(T)>* data = reinterpret_cast<BSSTestData<sizeof(T)>*>(d);
|
||||
ParquetByteStreamSplitDecoder<sizeof(T)> decoder;
|
||||
|
||||
for (int batch = 0; batch < batch_size; batch++) {
|
||||
uint8_t* output_ptr = data->output.data();
|
||||
decoder.NewPage(data->encoded_bdata.data(), data->encoded_bdata.size());
|
||||
for (int j = 0; j < data->encoded_bdata.size() / sizeof(T); j++) {
|
||||
if (decoder.NextValue(reinterpret_cast<T*>(output_ptr)) != 1) {
|
||||
std::cerr << "Error: Could not decode all values" << std::endl;
|
||||
return;
|
||||
}
|
||||
output_ptr += sizeof(T);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int B_SIZE>
|
||||
void BSSComp_DecodeStride(int batch_size, void* d) {
|
||||
BSSTestData<B_SIZE>* data = reinterpret_cast<BSSTestData<B_SIZE>*>(d);
|
||||
ParquetByteStreamSplitDecoder<B_SIZE> decoder;
|
||||
|
||||
for (int batch = 0; batch < batch_size; batch++) {
|
||||
uint8_t* output_ptr = data->output.data();
|
||||
decoder.NewPage(data->encoded_bdata.data(), data->encoded_bdata.size());
|
||||
if (decoder.NextValues(data->encoded_bdata.size() / B_SIZE, output_ptr, data->stride)
|
||||
!= data->encoded_bdata.size() / B_SIZE) {
|
||||
std::cerr << "Error: Could not decode all values" << std::endl;
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
template <int B_SIZE, int READ, int SKIP>
|
||||
void BSSComp_DecodeSkip(int batch_size, void* d) {
|
||||
BSSTestData<B_SIZE>* data = reinterpret_cast<BSSTestData<B_SIZE>*>(d);
|
||||
ParquetByteStreamSplitDecoder<B_SIZE> decoder;
|
||||
|
||||
for (int batch = 0; batch < batch_size; batch++) {
|
||||
uint8_t* output_ptr = data->output.data();
|
||||
decoder.NewPage(data->encoded_bdata.data(), data->encoded_bdata.size());
|
||||
for (int i = 0; i < decoder.GetTotalValueCount(); i += READ + SKIP) {
|
||||
if (decoder.NextValues(READ, output_ptr, B_SIZE) < 0) {
|
||||
std::cerr << "Error reading values at index " << i << std::endl;
|
||||
return;
|
||||
}
|
||||
if (decoder.SkipValues(SKIP) < 0) {
|
||||
std::cerr << "Error skipping values at index " << i << std::endl;
|
||||
return;
|
||||
}
|
||||
output_ptr += READ * B_SIZE;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ------------------------------- Benchmark Functions -------------------------------- //
|
||||
|
||||
// ................. BSS Benchmarks ................. //
|
||||
|
||||
void CompileVSRuntime() {
|
||||
std::vector<uint8_t> byte_data4b;
|
||||
std::vector<uint8_t> byte_data8b;
|
||||
DataSequentialGen(&byte_data4b, 4);
|
||||
DataSequentialGen(&byte_data8b, 8);
|
||||
|
||||
BSSTestData<4> dataIntTempl(byte_data4b);
|
||||
BSSTestData<8> dataLongTempl(byte_data8b);
|
||||
BSSTestData<4> dataIntConstr(byte_data4b);
|
||||
BSSTestData<8> dataLongConstr(byte_data8b);
|
||||
|
||||
// Compile - template, Runtime - constructor
|
||||
Benchmark suite("Compile VS Runtime | Sequential | Batched");
|
||||
suite.AddBenchmark("Compile Int", BSSComp_DecodeBatch<sizeof(int)>, &dataIntConstr);
|
||||
suite.AddBenchmark("Runtime Int", BSSRun_DecodeBatch<sizeof(int)>, &dataIntTempl);
|
||||
suite.AddBenchmark("Compile Long", BSSComp_DecodeBatch<sizeof(long)>, &dataLongConstr);
|
||||
suite.AddBenchmark("Runtime Long", BSSRun_DecodeBatch<sizeof(long)>, &dataLongTempl);
|
||||
std::cout << suite.Measure();
|
||||
|
||||
// Test the output data to make sure that the functions are not optimised out
|
||||
|
||||
testOutputCorrectness(dataIntTempl.output, dataIntTempl.input_bdata);
|
||||
testOutputCorrectness(dataLongTempl.output, dataLongTempl.input_bdata);
|
||||
testOutputCorrectness(dataIntConstr.output, dataIntConstr.input_bdata);
|
||||
testOutputCorrectness(dataLongConstr.output, dataLongConstr.input_bdata);
|
||||
}
|
||||
|
||||
void TypeComparison() {
|
||||
std::vector<uint8_t> byte_data4b;
|
||||
std::vector<uint8_t> byte_data8b;
|
||||
std::vector<uint8_t> byte_data6b;
|
||||
std::vector<uint8_t> byte_data11b;
|
||||
|
||||
DataRandGen(&byte_data4b, 4);
|
||||
DataRandGen(&byte_data6b, 6);
|
||||
DataRandGen(&byte_data8b, 8);
|
||||
DataRandGen(&byte_data11b, 11);
|
||||
|
||||
BSSTestData<4> dataInt(byte_data4b);
|
||||
BSSTestData<8> dataLong(byte_data8b);
|
||||
BSSTestData<6> data6b(byte_data6b);
|
||||
BSSTestData<4> dataFloat(byte_data4b);
|
||||
BSSTestData<8> dataDouble(byte_data8b);
|
||||
BSSTestData<11> data11b(byte_data11b);
|
||||
|
||||
// Since we are comparing types that are not a size of 4 or 8, we must use the runtime
|
||||
// version.
|
||||
Benchmark suite("Type Comparison | Runtime | Random | Batched");
|
||||
suite.AddBenchmark("Int", BSSRun_DecodeBatch<sizeof(int)>, &dataInt);
|
||||
suite.AddBenchmark("Float", BSSRun_DecodeBatch<sizeof(float)>, &dataFloat);
|
||||
suite.AddBenchmark("6 bytes", BSSRun_DecodeBatch<6>, &data6b);
|
||||
suite.AddBenchmark("Long", BSSRun_DecodeBatch<sizeof(long)>, &dataLong);
|
||||
suite.AddBenchmark("Double", BSSRun_DecodeBatch<sizeof(double)>, &dataDouble);
|
||||
suite.AddBenchmark("11 bytes", BSSRun_DecodeBatch<11>, &data11b);
|
||||
std::cout << suite.Measure();
|
||||
|
||||
// Test the output data to make sure that the functions are not optimised out
|
||||
|
||||
testOutputCorrectness(dataInt.output, dataInt.input_bdata);
|
||||
testOutputCorrectness(dataLong.output, dataLong.input_bdata);
|
||||
testOutputCorrectness(data6b.output, data6b.input_bdata);
|
||||
testOutputCorrectness(dataFloat.output, dataFloat.input_bdata);
|
||||
testOutputCorrectness(dataDouble.output, dataDouble.input_bdata);
|
||||
testOutputCorrectness(data11b.output, data11b.input_bdata);
|
||||
}
|
||||
|
||||
void RepeatingVSSequentialVSRandom() {
|
||||
std::vector<uint8_t> repeating_data4b;
|
||||
std::vector<uint8_t> repeating_data8b;
|
||||
std::vector<uint8_t> sequential_data4b;
|
||||
std::vector<uint8_t> sequential_data8b;
|
||||
std::vector<uint8_t> random_data4b;
|
||||
std::vector<uint8_t> random_data8b;
|
||||
|
||||
DataSameRepGen(&repeating_data4b, 4);
|
||||
DataSameRepGen(&repeating_data8b, 8);
|
||||
DataSequentialGen(&sequential_data4b, 4);
|
||||
DataSequentialGen(&sequential_data8b, 8);
|
||||
DataRandGen(&random_data4b, 4);
|
||||
DataRandGen(&random_data8b, 8);
|
||||
|
||||
BSSTestData<4> dataIntRep(repeating_data4b);
|
||||
BSSTestData<8> dataLongRep(repeating_data8b);
|
||||
|
||||
BSSTestData<4> dataIntSeq(sequential_data4b);
|
||||
BSSTestData<8> dataLongSeq(sequential_data8b);
|
||||
|
||||
BSSTestData<4> dataIntRand(random_data4b);
|
||||
BSSTestData<8> dataLongRand(random_data8b);
|
||||
|
||||
Benchmark suite("Repeating VS Sequential VS Random | Compile Time | Batched");
|
||||
suite.AddBenchmark("Repeating Int", BSSComp_DecodeBatch<sizeof(int)>, &dataIntRep);
|
||||
suite.AddBenchmark("Sequential Int", BSSComp_DecodeBatch<sizeof(int)>, &dataIntSeq);
|
||||
suite.AddBenchmark("Random Int", BSSComp_DecodeBatch<sizeof(int)>, &dataIntRand);
|
||||
suite.AddBenchmark("Repeating Long", BSSComp_DecodeBatch<sizeof(long)>, &dataLongRep);
|
||||
suite.AddBenchmark("Sequential Long", BSSComp_DecodeBatch<sizeof(long)>, &dataLongSeq);
|
||||
suite.AddBenchmark("Random Long", BSSComp_DecodeBatch<sizeof(long)>, &dataLongRand);
|
||||
std::cout << suite.Measure();
|
||||
|
||||
// Test the output data to make sure that the functions are not optimised out
|
||||
|
||||
testOutputCorrectness(dataIntRep.output, dataIntRep.input_bdata);
|
||||
testOutputCorrectness(dataLongRep.output, dataLongRep.input_bdata);
|
||||
testOutputCorrectness(dataIntSeq.output, dataIntSeq.input_bdata);
|
||||
testOutputCorrectness(dataLongSeq.output, dataLongSeq.input_bdata);
|
||||
testOutputCorrectness(dataIntRand.output, dataIntRand.input_bdata);
|
||||
testOutputCorrectness(dataLongRand.output, dataLongRand.input_bdata);
|
||||
}
|
||||
|
||||
void SinglesVSBatchVSStride() {
|
||||
std::vector<uint8_t> byte_data4b;
|
||||
std::vector<uint8_t> byte_data8b;
|
||||
DataSequentialGen(&byte_data4b, 4);
|
||||
DataSequentialGen(&byte_data8b, 8);
|
||||
|
||||
BSSTestData<4> dataIntSingles(byte_data4b);
|
||||
BSSTestData<8> dataLongSingles(byte_data8b);
|
||||
|
||||
BSSTestData<4> dataIntBatch(byte_data4b);
|
||||
BSSTestData<8> dataLongBatch(byte_data8b);
|
||||
|
||||
constexpr int stride = sizeof(int) + sizeof(long) + 7;
|
||||
|
||||
BSSTestData<4> dataIntStride(byte_data4b, stride);
|
||||
BSSTestData<8> dataLongStride(byte_data8b, stride);
|
||||
|
||||
Benchmark suite("Singles VS Batch VS Stride | Compile Time | Sequential");
|
||||
suite.AddBenchmark("Singles Int", BSSComp_DecodeSingles<int>, &dataIntSingles);
|
||||
suite.AddBenchmark("Batch Int", BSSComp_DecodeBatch<sizeof(int)>, &dataIntBatch);
|
||||
suite.AddBenchmark("Stride Int", BSSComp_DecodeStride<sizeof(int)>, &dataIntStride);
|
||||
suite.AddBenchmark("Singles Long", BSSComp_DecodeSingles<long>, &dataLongSingles);
|
||||
suite.AddBenchmark("Batch Long", BSSComp_DecodeBatch<sizeof(long)>, &dataLongBatch);
|
||||
suite.AddBenchmark("Stride Long", BSSComp_DecodeStride<sizeof(long)>, &dataLongStride);
|
||||
std::cout << suite.Measure();
|
||||
|
||||
// Test the output data to make sure that the functions are not optimised out
|
||||
|
||||
testOutputCorrectness(dataIntSingles.output, dataIntSingles.input_bdata);
|
||||
testOutputCorrectness(dataLongSingles.output, dataLongSingles.input_bdata);
|
||||
|
||||
testOutputCorrectness(dataIntBatch.output, dataIntBatch.input_bdata);
|
||||
testOutputCorrectness(dataLongBatch.output, dataLongBatch.input_bdata);
|
||||
|
||||
testOutputCorrectness(dataIntStride.output,
|
||||
GenerateStrided<sizeof(int)>(dataIntStride.input_bdata, dataIntStride.stride));
|
||||
testOutputCorrectness(dataLongStride.output,
|
||||
GenerateStrided<sizeof(long)>(dataLongStride.input_bdata, dataLongStride.stride));
|
||||
}
|
||||
|
||||
void StrideSizeComparison(int strideS, int strideM, int strideL) {
|
||||
std::vector<uint8_t> byte_data4b;
|
||||
std::vector<uint8_t> byte_data8b;
|
||||
|
||||
DataSequentialGen(&byte_data4b, 4);
|
||||
DataSequentialGen(&byte_data8b, 8);
|
||||
|
||||
BSSTestData<4> dataIntSStride(byte_data4b, strideS);
|
||||
BSSTestData<4> dataIntMStride(byte_data4b, strideM);
|
||||
BSSTestData<4> dataIntLStride(byte_data4b, strideL);
|
||||
BSSTestData<8> dataLongSStride(byte_data8b, strideS);
|
||||
BSSTestData<8> dataLongMStride(byte_data8b, strideM);
|
||||
BSSTestData<8> dataLongLStride(byte_data8b, strideL);
|
||||
|
||||
Benchmark suite("Small VS Medium VS Large Stride | Compile Time | Sequential | Batched");
|
||||
suite.AddBenchmark("S Stride Int", BSSComp_DecodeStride<sizeof(int)>, &dataIntSStride);
|
||||
suite.AddBenchmark("M Stride Int", BSSComp_DecodeStride<sizeof(int)>, &dataIntMStride);
|
||||
suite.AddBenchmark("L Stride Int", BSSComp_DecodeStride<sizeof(int)>, &dataIntLStride);
|
||||
suite.AddBenchmark("S Stride Long", BSSComp_DecodeStride<sizeof(long)>,
|
||||
&dataLongSStride);
|
||||
suite.AddBenchmark("M Stride Long", BSSComp_DecodeStride<sizeof(long)>,
|
||||
&dataLongMStride);
|
||||
suite.AddBenchmark("L Stride Long", BSSComp_DecodeStride<sizeof(long)>,
|
||||
&dataLongLStride);
|
||||
std::cout << suite.Measure();
|
||||
|
||||
// Test the output data to make sure that the functions are not optimised out
|
||||
|
||||
testOutputCorrectness(dataIntSStride.output,
|
||||
GenerateStrided<sizeof(int)>(dataIntSStride.input_bdata, dataIntSStride.stride));
|
||||
testOutputCorrectness(dataIntMStride.output,
|
||||
GenerateStrided<sizeof(int)>(dataIntMStride.input_bdata, dataIntMStride.stride));
|
||||
testOutputCorrectness(dataIntLStride.output,
|
||||
GenerateStrided<sizeof(int)>(dataIntLStride.input_bdata, dataIntLStride.stride));
|
||||
|
||||
testOutputCorrectness(dataLongSStride.output,
|
||||
GenerateStrided<sizeof(long)>(dataLongSStride.input_bdata, dataLongSStride.stride));
|
||||
testOutputCorrectness(dataLongMStride.output,
|
||||
GenerateStrided<sizeof(long)>(dataLongMStride.input_bdata, dataLongMStride.stride));
|
||||
testOutputCorrectness(dataLongLStride.output,
|
||||
GenerateStrided<sizeof(long)>(dataLongLStride.input_bdata, dataLongLStride.stride));
|
||||
}
|
||||
|
||||
// ---------------------------------- Main Function ----------------------------------- //
|
||||
|
||||
int main(int argc, char** argv) {
|
||||
constexpr int pool = 124;
|
||||
constexpr int strideS = sizeof(int) + sizeof(long) + 3;
|
||||
constexpr int strideM = 199 * strideS;
|
||||
constexpr int strideL = 14235 * strideS;
|
||||
constexpr int read = 82;
|
||||
constexpr int skip = 18;
|
||||
|
||||
CpuInfo::Init();
|
||||
std::cout << " " << Benchmark::GetMachineInfo() << std::endl;
|
||||
std::cout << " Data Batch Size = " << DATA_BATCH_SIZE
|
||||
<< std::endl;
|
||||
std::cout << " Data Pool Size for Pooled Data = " << pool << std::endl;
|
||||
std::cout << " Skip Sizes (Read | Skip): " <<
|
||||
read << " | " << skip << std::endl;
|
||||
std::cout << " Stride Sizes (S | M | L): " <<
|
||||
strideS << " | " << strideM << " | " << strideL << std::endl;
|
||||
std::cout << "\n\n";
|
||||
|
||||
std::cout << "\n\n";
|
||||
std::cout << "━━━━━━━━━━━━━━━━━━━━━ Byte Stream Split functionality comparison "
|
||||
<< "━━━━━━━━━━━━━━━━━━━━━━\n";
|
||||
std::cout << "\n";
|
||||
|
||||
CompileVSRuntime();
|
||||
std::cout << "\n\n";
|
||||
TypeComparison();
|
||||
std::cout << "\n\n";
|
||||
RepeatingVSSequentialVSRandom();
|
||||
std::cout << "\n\n";
|
||||
SinglesVSBatchVSStride();
|
||||
std::cout << "\n\n";
|
||||
StrideSizeComparison(strideS, strideM, strideL);
|
||||
std::cout << "\n\n";
|
||||
|
||||
return 0;
|
||||
}
|
||||
Reference in New Issue
Block a user