IMPALA-6348: Redact only sensitive fields in runtime profiles

Without this patch, redaction is applied to every field in the runtime profile. This approach has an undesired side effect when Kerberos auth + email redaction is in place. Since the redaction applies to every field, even principals (from Connected/Delegated User fields) are redacted, as the Kerberos principal format generally pattern matches with an email redactor template. This is particularly problematic for monitoring tools that consume runtime profiles and use these fields to group the queries by user. This patch fixes the problem by redacting only the following sensitive fields. - Query Statement - Error logs (since they can contain column references etc.) - Query Status - Query Plan Other fields in the runtime profile are left unredacted. Change-Id: Iae3b6726009bf458a7ec73131e5d659b12ab73cf Reviewed-on: http://gerrit.cloudera.org:8080/8934 Reviewed-by: Bharath Vissapragada <bharathv@cloudera.com> Tested-by: Impala Public Jenkins
2025-12-25 02:03:09 -05:00 · 2018-01-03 15:31:32 -08:00
parent ce65b43d47
commit 6a87eb20a5
6 changed files with 56 additions and 13 deletions
--- a/be/src/service/client-request-state.cc
+++ b/be/src/service/client-request-state.cc
@@ -124,7 +124,8 @@ ClientRequestState::ClientRequestState(
  summary_profile_->AddInfoString("Network Address",
      lexical_cast<string>(session_->network_address));
  summary_profile_->AddInfoString("Default Db", default_db());
-  summary_profile_->AddInfoString("Sql Statement", query_ctx_.client_request.stmt);
+  summary_profile_->AddInfoStringRedacted(
+      "Sql Statement", query_ctx_.client_request.stmt);
  summary_profile_->AddInfoString("Coordinator",
      TNetworkAddressToString(exec_env->backend_address()));
 }
@@ -419,7 +420,7 @@ Status ClientRequestState::ExecQueryOrDmlRequest(
    plan_ss << "\n----------------\n"
            << query_exec_request.query_plan
            << "----------------";
-    summary_profile_->AddInfoString("Plan", plan_ss.str());
+    summary_profile_->AddInfoStringRedacted("Plan", plan_ss.str());
  }
  // Add info strings consumed by CM: Estimated mem and tables missing stats.
  if (query_exec_request.__isset.per_host_mem_estimate) {
@@ -741,7 +742,7 @@ Status ClientRequestState::UpdateQueryStatus(const Status& status) {
  if (!status.ok() && query_status_.ok()) {
    UpdateQueryState(beeswax::QueryState::EXCEPTION);
    query_status_ = status;
-    summary_profile_->AddInfoString("Query Status", query_status_.GetDetail());
+    summary_profile_->AddInfoStringRedacted("Query Status", query_status_.GetDetail());
  }

  return status;
--- a/be/src/service/client-request-state.h
+++ b/be/src/service/client-request-state.h
@@ -296,10 +296,18 @@ class ClientRequestState {
  /// * server_profile_ tracks time spent inside the ImpalaServer,
  ///   but not inside fragment execution, i.e. the time taken to
  ///   register and set-up the query and for rows to be fetched.
-  //
+  ///
  /// There's a fourth profile which is not built here (but is a
  /// child of profile_); the execution profile which tracks the
  /// actual fragment execution.
+  ///
+  /// Redaction: Only the following info strings in the profile are redacted as they
+  /// are expected to contain sensitive information like schema/column references etc.
+  /// Other fields are left unredacted.
+  /// - Query Statement
+  /// - Query Plan
+  /// - Query Status
+  /// - Error logs
  RuntimeProfile* const profile_;
  RuntimeProfile* const server_profile_;
  RuntimeProfile* const summary_profile_;
--- a/be/src/service/impala-server.cc
+++ b/be/src/service/impala-server.cc
@@ -1030,8 +1030,8 @@ Status ImpalaServer::UnregisterQuery(const TUniqueId& query_id, bool check_infli
    TExecSummary t_exec_summary;
    request_state->coord()->GetTExecSummary(&t_exec_summary);
    string exec_summary = PrintExecSummary(t_exec_summary);
-    request_state->summary_profile()->AddInfoString("ExecSummary", exec_summary);
-    request_state->summary_profile()->AddInfoString("Errors",
+    request_state->summary_profile()->AddInfoStringRedacted("ExecSummary", exec_summary);
+    request_state->summary_profile()->AddInfoStringRedacted("Errors",
        request_state->coord()->GetErrorLog());

    const PerBackendExecParams& per_backend_params =
--- a/be/src/util/runtime-profile.cc
+++ b/be/src/util/runtime-profile.cc
@@ -466,14 +466,17 @@ void RuntimeProfile::AddInfoString(const string& key, const string& value) {
  return AddInfoStringInternal(key, value, false);
 }

+void RuntimeProfile::AddInfoStringRedacted(const string& key, const string& value) {
+  return AddInfoStringInternal(key, value, false, true);
+}
+
 void RuntimeProfile::AppendInfoString(const string& key, const string& value) {
  return AddInfoStringInternal(key, value, true);
 }

 void RuntimeProfile::AddInfoStringInternal(
-    const string& key, const string& value, bool append) {
-  // Values may contain sensitive data, such as a query.
-  const string& info = RedactCopy(value);
+    const string& key, const string& value, bool append, bool redact) {
+  const string& info = redact ? RedactCopy(value): value;
  lock_guard<SpinLock> l(info_strings_lock_);
  InfoStrings::iterator it = info_strings_.find(key);
  if (it == info_strings_.end()) {
--- a/be/src/util/runtime-profile.h
+++ b/be/src/util/runtime-profile.h
@@ -209,6 +209,10 @@ class RuntimeProfile { // NOLINT: This struct is not packed, but there are not s
  /// the value will be updated.
  void AddInfoString(const std::string& key, const std::string& value);

+  /// Same as AddInfoString(), except that this method applies the redaction
+  /// rules on 'value' before adding it to the runtime profile.
+  void AddInfoStringRedacted(const std::string& key, const std::string& value);
+
  /// Adds a string to the runtime profile.  If a value already exists for 'key',
  /// 'value' will be appended to the previous value, with ", " separating them.
  void AppendInfoString(const std::string& key, const std::string& value);
@@ -473,8 +477,9 @@ class RuntimeProfile { // NOLINT: This struct is not packed, but there are not s

  /// Implementation of AddInfoString() and AppendInfoString(). If 'append' is false,
  /// implements AddInfoString(), otherwise implements AppendInfoString().
+  /// Redaction rules are applied on the info string if 'redact' is true.
  void AddInfoStringInternal(
-      const std::string& key, const std::string& value, bool append);
+      const std::string& key, const std::string& value, bool append, bool redact = false);

  /// Name of the counter maintaining the total time.
  static const std::string TOTAL_TIME_COUNTER_NAME;
--- a/tests/custom_cluster/test_redaction.py
+++ b/tests/custom_cluster/test_redaction.py
@@ -15,6 +15,7 @@
 # specific language governing permissions and limitations
 # under the License.

+import getpass
 import logging
 import os
 import pytest
@@ -160,6 +161,14 @@ class TestRedaction(CustomClusterTestSuite, unittest.TestCase):
      assert results, "Web page %s should contain '%s' but does not" \
          % (url, search)

+  def assert_query_profile_contains(self, query_id, search):
+    ''' Asserts that the query profile for 'query_id' contains 'search' string'''
+    impala_service = self.create_impala_service()
+    url = 'query_profile?query_id=%s' % query_id
+    results = self.grep_file(impala_service.open_debug_webpage(url), search)
+    assert results, "Query profile %s should contain '%s' but does not" \
+        % (url, search)
+
  def assert_file_in_dir_contains(self, dir, search):
    '''Asserts that at least one file in the 'dir' contains the 'search' term.'''
    results = self.grep_dir(dir,search)
@@ -271,6 +280,7 @@ class TestRedaction(CustomClusterTestSuite, unittest.TestCase):
    '''Check that redaction rules prevent 'sensitive' data from leaking into the
       logs and web ui.
    '''
+    current_user = getpass.getuser()
    self.start_cluster_using_rules(r"""
      {
        "version": 1,
@@ -285,17 +295,26 @@ class TestRedaction(CustomClusterTestSuite, unittest.TestCase):
            "description": "Don't show credit cards numbers",
            "search": "\\d{4}-\\d{4}-\\d{4}-\\d{4}",
            "replace": "*credit card*"
+          },
+          {
+            "description": "Don't show current username in queries",
+            "search": "%s",
+            "replace": "redacted user"
          }
        ]
-      }""")
+      }""" % current_user)
    email = 'FOO@bar.com'
    # GROUP BY an expr containing the email so the expr will also be shown in the exec
    # node summary, ie HASH(string_col = ...).
-    self.execute_query_expect_success(self.client,
-        "SELECT string_col = '%s', COUNT(*) FROM functional.alltypes GROUP BY 1" % email)
+    query_template =\
+        "SELECT string_col = '%s', COUNT(*) FROM functional.alltypes GROUP BY 1"
+    self.execute_query_expect_success(self.client, query_template % email)

+    user_profile_pattern = "User: %s" % current_user
    # The email should be replaced with '*email*'.
    self.assert_web_ui_redaction(self.find_last_query_id(), email, "*email*")
+    # User field should not be redacted from the query profile.
+    self.assert_query_profile_contains(self.find_last_query_id(), user_profile_pattern)
    # Wait for the logs to be written.
    sleep(5)
    self.assert_log_redaction(email, "*email*")
@@ -306,10 +325,17 @@ class TestRedaction(CustomClusterTestSuite, unittest.TestCase):
    # This assertion below relies on the fact that there is a syntax error be near
    # the credit card number so the number would have appeared in the message.
    self.assert_web_ui_redaction(self.find_last_query_id(), credit_card, "*credit card*")
+    # User field should not be redacted from the query profile.
+    self.assert_query_profile_contains(self.find_last_query_id(), user_profile_pattern)
    sleep(5)
    # Apparently an invalid query doesn't generate an audit log entry.
    self.assert_log_redaction(credit_card, "*credit card*", expect_audit=False)

+    # Assert that the username in the query stmt is redacted but not from the user fields.
+    self.execute_query_expect_success(self.client, query_template % current_user)
+    self.assert_query_profile_contains(self.find_last_query_id(), user_profile_pattern)
+    self.assert_query_profile_contains(self.find_last_query_id(), "redacted user")
+
    # Since all the tests passed, the log dir shouldn't be of interest and can be
    # deleted.
    shutil.rmtree(self.tmp_dir)