IMPALA-12864: Deflake test_query_log_size_in_bytes.

test_query_log_size_in_bytes is flaky in exhaustive builds. The expected QueryStateRecord is off by around 100KB per query, indicating that the test query might be too complex and lead to variability in final query profile that is being archived. This patch simplifies the test query and relaxes the assertion. This patch also adds QUERY_LOG_EST_TOTAL_BYTES metric (key name "impala-server.query-log-est-total-bytes") that is validated as well in test_query_log_size_in_bytes. query-state-record-test.cc is modified a bit to resolve segmentation fault issue when building unified-be-test-validated-executable target run_clang_tidy.sh. Testing: - Pass test_query_log_size_in_bytes in exhaustive build. Change-Id: Ic9b543ffa931a01c11d58e45774ea46af78b3a25 Reviewed-on: http://gerrit.cloudera.org:8080/21100 Reviewed-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-12-19 09:58:28 -05:00 · 2024-03-01 13:27:34 -08:00
parent 784971c018
commit ff5cbadec4
10 changed files with 55 additions and 29 deletions
--- a/be/src/service/CMakeLists.txt
+++ b/be/src/service/CMakeLists.txt
@@ -142,5 +142,5 @@ ADD_BE_TEST(session-expiry-test session-expiry-test.cc) # TODO: this leaks thrif
 ADD_UNIFIED_BE_LSAN_TEST(hs2-util-test "StitchNullsTest.*:PrintTColumnValueTest.*")
 ADD_UNIFIED_BE_LSAN_TEST(query-options-test QueryOptions.*)
 ADD_UNIFIED_BE_LSAN_TEST(impala-server-test ImpalaServerTest.*)
-ADD_UNIFIED_BE_LSAN_TEST(query-state-record-test QueryStateRecordTest.*:PerHostStateTest.*)
+ADD_UNIFIED_BE_LSAN_TEST(query-state-record-test QueryStateRecordTest.*)
 ADD_BE_LSAN_TEST(internal-server-test)
--- a/be/src/service/impala-server.cc
+++ b/be/src/service/impala-server.cc
@@ -1164,7 +1164,7 @@ void ImpalaServer::ArchiveQuery(const QueryHandle& query_handle) {
  {
    lock_guard<mutex> l(query_log_lock_);
    // Add record to the beginning of the log, and to the lookup index.
-    query_log_est_total_size_ += record_size;
+    ImpaladMetrics::QUERY_LOG_EST_TOTAL_BYTES->Increment(record_size);
    query_log_est_sizes_.push_front(record_size);
    query_log_.push_front(move(record));
    query_log_index_[query_handle->query_id()] = &query_log_.front();
@@ -1172,12 +1172,13 @@ void ImpalaServer::ArchiveQuery(const QueryHandle& query_handle) {
    while (!query_log_.empty()
        && ((FLAGS_query_log_size > -1 && FLAGS_query_log_size < query_log_.size())
            || (FLAGS_query_log_size_in_bytes > -1
-                && FLAGS_query_log_size_in_bytes < query_log_est_total_size_))) {
+                && FLAGS_query_log_size_in_bytes
+                    < ImpaladMetrics::QUERY_LOG_EST_TOTAL_BYTES->GetValue()))) {
      query_log_index_.erase(query_log_.back()->id);
-      query_log_est_total_size_ -= query_log_est_sizes_.back();
+      ImpaladMetrics::QUERY_LOG_EST_TOTAL_BYTES->Increment(-query_log_est_sizes_.back());
      query_log_est_sizes_.pop_back();
      query_log_.pop_back();
-      DCHECK_GE(query_log_est_total_size_, 0);
+      DCHECK_GE(ImpaladMetrics::QUERY_LOG_EST_TOTAL_BYTES->GetValue(), 0);
      DCHECK_EQ(query_log_.size(), query_log_est_sizes_.size());
    }
  }
--- a/be/src/service/impala-server.h
+++ b/be/src/service/impala-server.h
@@ -1133,8 +1133,7 @@ class ImpalaServer : public ImpalaServiceIf,
  /// Random number generator for use in this class, thread safe.
  static ThreadSafeRandom rng_;

-  /// Guards query_log_, query_log_index_, query_log_est_sizes_,
-  /// and query_log_est_total_size_.
+  /// Guards query_log_, query_log_index_, and query_log_est_sizes_.
  std::mutex query_log_lock_;

  /// FIFO list of query records, which are written after the query finishes executing.
@@ -1153,9 +1152,8 @@ class ImpalaServer : public ImpalaServiceIf,
      QueryLogIndex;
  QueryLogIndex query_log_index_;

-  /// Estimated individual and total size of records in query_log_ in bytes.
+  /// Estimated individual size of records in query_log_ in bytes.
  std::list<int64_t> query_log_est_sizes_;
-  int64_t query_log_est_total_size_ = 0;

  /// Sets the given query_record (and retried_query_record too if given) for the given
  /// query_id. Returns an error Status if the given query_id cannot be found in the
--- a/be/src/service/query-state-record-test.cc
+++ b/be/src/service/query-state-record-test.cc
@@ -125,7 +125,7 @@ TEST(QueryStateRecordTest, EventsTimelineIterator) {
  }
 }

-TEST(PerHostStateTest, PeakMemoryComparatorLessThan) {
+TEST(QueryStateRecordTest, PerHostStatePeakMemoryComparatorLessThan) {
  TNetworkAddress addr_a;
  PerHostState a;
  a.peak_memory_usage = std::numeric_limits<int64_t>::min();
@@ -140,15 +140,13 @@ TEST(PerHostStateTest, PeakMemoryComparatorLessThan) {
  EXPECT_FALSE(PerHostPeakMemoryComparator(pair_b, pair_a));
 }

-TEST(PerHostStateTest, PeakMemoryComparatorEqual) {
+TEST(QueryStateRecordTest, PerHostStatePeakMemoryComparatorEqual) {
  TNetworkAddress addr_a;
  PerHostState a;
-  a.peak_memory_usage = 0;
  std::pair<TNetworkAddress, PerHostState> pair_a = std::make_pair(addr_a, a);

  TNetworkAddress addr_b;
  PerHostState b;
-  b.peak_memory_usage = 0;
  std::pair<TNetworkAddress, PerHostState> pair_b = std::make_pair(addr_b, b);

  EXPECT_FALSE(PerHostPeakMemoryComparator(pair_a, pair_b));
--- a/be/src/service/query-state-record.h
+++ b/be/src/service/query-state-record.h
@@ -178,7 +178,7 @@ struct PerHostState {
  int32_t fragment_instance_count = 0;

  // Peak Memory Usage
-  int64_t peak_memory_usage;
+  int64_t peak_memory_usage = 0;
 }; // struct PerHostState

 /// Comparator function that compares two PerHostState structs based on the
--- a/be/src/util/impalad-metrics.cc
+++ b/be/src/util/impalad-metrics.cc
@@ -167,6 +167,8 @@ const char* ImpaladMetricKeys::HEDGED_READ_OPS =
 const char* ImpaladMetricKeys::HEDGED_READ_OPS_WIN =
    "impala-server.hedged-read-ops-win";
 const char* ImpaladMetricKeys::DEBUG_ACTION_NUM_FAIL = "impala.debug_action.fail";
+const char* ImpaladMetricKeys::QUERY_LOG_EST_TOTAL_BYTES =
+    "impala-server.query-log-est-total-bytes";

 // These are created by impala-server during startup.
 // =======
@@ -237,6 +239,7 @@ IntGauge* ImpaladMetrics::NUM_FILES_OPEN_FOR_INSERT = nullptr;
 IntGauge* ImpaladMetrics::NUM_QUERIES_REGISTERED = nullptr;
 IntGauge* ImpaladMetrics::RESULTSET_CACHE_TOTAL_NUM_ROWS = nullptr;
 IntGauge* ImpaladMetrics::RESULTSET_CACHE_TOTAL_BYTES = nullptr;
+IntGauge* ImpaladMetrics::QUERY_LOG_EST_TOTAL_BYTES = nullptr;
 DoubleGauge* ImpaladMetrics::CATALOG_CACHE_AVG_LOAD_TIME = nullptr;
 DoubleGauge* ImpaladMetrics::CATALOG_CACHE_HIT_RATE = nullptr;
 DoubleGauge* ImpaladMetrics::CATALOG_CACHE_LOAD_EXCEPTION_RATE = nullptr;
@@ -342,6 +345,8 @@ void ImpaladMetrics::CreateMetrics(MetricGroup* m) {
      ImpaladMetricKeys::RESULTSET_CACHE_TOTAL_NUM_ROWS, 0);
  RESULTSET_CACHE_TOTAL_BYTES = m->AddGauge(
      ImpaladMetricKeys::RESULTSET_CACHE_TOTAL_BYTES, 0);
+  QUERY_LOG_EST_TOTAL_BYTES =
+      m->AddGauge(ImpaladMetricKeys::QUERY_LOG_EST_TOTAL_BYTES, 0);

  // Initialize scan node metrics
  NUM_RANGES_PROCESSED = m->AddCounter(
--- a/be/src/util/impalad-metrics.h
+++ b/be/src/util/impalad-metrics.h
@@ -261,6 +261,10 @@ class ImpaladMetricKeys {
  /// --debug_actions is set.
  static const char* DEBUG_ACTION_NUM_FAIL;

+  // Estimated total size of query logs that are currently retained in memory.
+  // Associated metric is modified in ImpalaServer::ArchiveQuery and must hold
+  // ImpalaServer::query_log_lock_ on modification.
+  static const char* QUERY_LOG_EST_TOTAL_BYTES;
 };

 /// Global impalad-wide metrics.  This is useful for objects that want to update metrics
@@ -344,6 +348,7 @@ class ImpaladMetrics {
  static IntGauge* NUM_QUERIES_REGISTERED;
  static IntGauge* RESULTSET_CACHE_TOTAL_NUM_ROWS;
  static IntGauge* RESULTSET_CACHE_TOTAL_BYTES;
+  static IntGauge* QUERY_LOG_EST_TOTAL_BYTES;

  // Properties
  static BooleanProperty* CATALOG_READY;
--- a/bin/run_clang_tidy.sh
+++ b/bin/run_clang_tidy.sh
@@ -33,7 +33,9 @@ TMP_BUILDALL_LOG=$(mktemp)
 echo "Compiling, for build logs see ${TMP_BUILDALL_LOG}"
 if ! ./buildall.sh -skiptests -tidy -so -noclean &> "${TMP_BUILDALL_LOG}"
 then
-  echo "buildall.sh failed, dumping output" >&2
+  echo "buildall.sh failed!" >&2
+  grep "^make.* Error " ${TMP_BUILDALL_LOG} >&2
+  echo "Dumping output of ./buildall.sh -skiptests -tidy -so -noclean" >&2
  cat "${TMP_BUILDALL_LOG}"
  exit 1
 fi
--- a/common/thrift/metrics.json
+++ b/common/thrift/metrics.json
@@ -2853,6 +2853,16 @@
    "kind": "HISTOGRAM",
    "key": "impala-server.query-durations-ms"
  },
+  {
+    "description": "Estimated total size of query logs that are currently retained in memory.",
+    "contexts": [
+        "IMPALAD"
+    ],
+    "label": "Query log size in memory.",
+    "units": "BYTES",
+    "kind": "GAUGE",
+    "key": "impala-server.query-log-est-total-bytes"
+  },
  {
    "description": "Distribution of DDL operation latencies",
    "contexts": [
--- a/tests/custom_cluster/test_web_pages.py
+++ b/tests/custom_cluster/test_web_pages.py
@@ -30,6 +30,8 @@ from tests.common.custom_cluster_test_suite import (
  CustomClusterTestSuite)
 from tests.shell.util import run_impala_shell_cmd

+SMALL_QUERY_LOG_SIZE_IN_BYTES = 40 * 1024
+

 class TestWebPage(CustomClusterTestSuite):
  @classmethod
@@ -149,23 +151,28 @@ class TestWebPage(CustomClusterTestSuite):

  @pytest.mark.execute_serially
  @CustomClusterTestSuite.with_args(
-      impalad_args="--query_log_size_in_bytes=" + str(2 * 1024 * 1024)
+      cluster_size=1,
+      impalad_args="--query_log_size_in_bytes=" + str(SMALL_QUERY_LOG_SIZE_IN_BYTES)
  )
  def test_query_log_size_in_bytes(self):
    """Check if query list is limited by query_log_size_in_bytes flag."""
-    # The input query will produce ~627228 bytes of QueryStateRecord in MT_DOP=8.
-    query = ("with l1 as (select id id1 from functional.alltypes), "
-        "l2 as (select a1.id1 id2 from l1 as a1 join l1 as b1 on a1.id1=-b1.id1), "
-        "l3 as (select a2.id2 id3 from l2 as a2 join l2 as b2 on a2.id2=-b2.id2), "
-        "l4 as (select a3.id3 id4 from l3 as a3 join l3 as b3 on a3.id3=-b3.id3), "
-        "l5 as (select a4.id4 id5 from l4 as a4 join l4 as b4 on a4.id4=-b4.id4) "
-        "select * from l5 limit 100;")
-    self.execute_query("SET MT_DOP=8;")
-    for i in range(0, 5):
-      self.execute_query(query)
-    response = requests.get("http://localhost:25000/queries?json")
-    queries_json = json.loads(response.text)
-    assert len(queries_json["completed_queries"]) == 3
+    # This simple test query will produce ~8520 bytes of QueryStateRecord.
+    query = "select version()"
+    num_queries = 10
+    for i in range(0, num_queries):
+      self.execute_query_expect_success(self.client, query)
+
+    # Retrieve and verify the total size metrics.
+    metric_key = "impala-server.query-log-est-total-bytes"
+    metric_value = self.cluster.impalads[0].service.get_metric_value(metric_key)
+    assert metric_value > 0
+    assert metric_value <= SMALL_QUERY_LOG_SIZE_IN_BYTES
+
+    # Verify that the query page only contains a subset of the test queries.
+    queries_response = requests.get("http://localhost:25000/queries?json")
+    queries_json = json.loads(queries_response.text)
+    assert len(queries_json["completed_queries"]) > 0
+    assert len(queries_json["completed_queries"]) < num_queries

  # Checks if 'messages' exists/does not exist in 'result_stderr' based on the value of
  # 'should_exist'