IMPALA-13305: Better thrift compatibility checks based on pyparsing

There are some false positive warnings reported by critique-gerrit-review.py when adding a new thrift struct that has required fields. This patch leverages pyparsing to analyze the thrift file changes. So we can identify whether the new required field is added in an existing struct. thrift_parser.py adds a simple thrift grammar parser to parse a thrift file into an AST. It basically consists of pyparsing.ParseResults and some customized classes to inject the line number, i.e. thrift_parser.ThriftField and thrift_parser.ThriftEnumItem. Import thrift_parser to parse the current version of a thrift file and the old version of it before the commit. critique-gerrit-review.py then compares the structs and enums to report these warnings: - A required field is deleted in an existing struct. - A new required field is added in an existing struct. - An existing field is renamed. - The qualifier (required/optional) of a field is changed. - The type of a field is changed. - An enum item is removed. - Enum items are reordered. Only thrift files used in both catalogd and impalad are checked. This is the same as the current version. We can further improve this by analyzing all RPCs used between impalad and catalogd to get all thrift struct/enums used in them. Warning examples for commit e48af8c04: "common/thrift/StatestoreService.thrift": [ { "message": "Renaming field 'sequence' to 'catalogd_version' in TUpdateCatalogdRequest might break the compatibility between impalad and catalogd/statestore during upgrade", "line": 345, "side": "REVISION" } ] Warning examples for commit 595212b4e: "common/thrift/CatalogObjects.thrift": [ { "message": "Adding a required field 'type' in TIcebergPartitionField might break the compatibility between impalad and catalogd/statestore during upgrade", "line": 612, "side": "REVISION" } ] Warning examples for commit c57921225: "common/thrift/CatalogObjects.thrift": [ { "message": "Renaming field 'partition_id' to 'spec_id' in TIcebergPartitionSpec might break the compatibility between impalad and catalogd/statestore during upgrade", "line": 606, "side": "REVISION" } ], "common/thrift/CatalogService.thrift": [ { "message": "Changing field 'iceberg_data_files_fb' from required to optional in TIcebergOperationParam might break the compatibility between impalad and catalogd/statestore during upgrade", "line": 215, "side": "REVISION" }, { "message": "Adding a required field 'operation' in TIcebergOperationParam might break the compatibility between impalad and catalogd/statestore during upgrade", "line": 209, "side": "REVISION" } ], "common/thrift/Query.thrift": [ { "message": "Renaming field 'spec_id' to 'iceberg_params' in TFinalizeParams might break the compatibility between impalad and catalogd/statestore during upgrade", "line": 876, "side": "REVISION" } ] Warning example for commit 2b2cf8d96: "common/thrift/CatalogService.thrift": [ { "message": "Enum item FUNCTION_NOT_FOUND=3 changed to TABLE_NOT_LOADED=3 in CatalogLookupStatus. This might break the compatibility between impalad and catalogd/statestore during upgrade", "line": 381, "side": "REVISION" } ] Warning example for commit c01efd096: "common/thrift/JniCatalog.thrift": [ { "message": "Removing the enum item TAlterTableType.SET_OWNER=15 might break the compatibility between impalad and catalogd/statestore during upgrade", "line": 107, "side": "PARENT" } ] Warning example for commit 374783c55: "common/thrift/Query.thrift": [ { "message": "Changing type of field 'enabled_runtime_filter_types' from PlanNodes.TEnabledRuntimeFilterTypes to set<PlanNodes.TRuntimeFilterType> in TQueryOptions might break the compatibility between impalad and catalogd/statestore during upgrade", "line": 449, "side": "REVISION" } Tests - Add tests in tests/infra/test_thrift_parser.py - Verified the script with all(1260) commits of common/thrift. Change-Id: Ia1dc4112404d0e7c5df94ee9f59a4fe2084b360d Reviewed-on: http://gerrit.cloudera.org:8080/22264 Reviewed-by: Riza Suminto <riza.suminto@cloudera.com> Reviewed-by: Michael Smith <michael.smith@cloudera.com> Tested-by: Impala Public Jenkins <impala-public-jenkins@cloudera.com>
2025-12-19 18:12:08 -05:00 · 2024-12-27 16:41:11 +08:00
parent ce6be49c08
commit 777ae104bb
4 changed files with 403 additions and 95 deletions
--- a/bin/jenkins/critique-gerrit-review.py
+++ b/bin/jenkins/critique-gerrit-review.py
@@ -44,10 +44,13 @@ from os import environ
 import os.path
 import re
 from subprocess import check_call, check_output, DEVNULL, Popen, PIPE
+import sys
 import venv

+
 FLAKE8_VERSION = "7.1.1"
 FLAKE8_DIFF_VERSION = "0.2.2"
+PYPARSING_VERSION = "3.1.4"

 VENV_PATH = "gerrit_critic_venv"
 VENV_BIN = os.path.join(VENV_PATH, "bin")
@@ -126,8 +129,12 @@ def setup_virtualenv():
  venv.create(VENV_PATH, with_pip=True, system_site_packages=True)
  check_call([PIP_PATH, "install",
              "wheel",
-              "flake8=={0}".format(FLAKE8_VERSION),
-              "flake8-diff=={0}".format(FLAKE8_DIFF_VERSION)])
+              f"flake8=={FLAKE8_VERSION}",
+              f"flake8-diff=={FLAKE8_DIFF_VERSION}",
+              f"pyparsing=={PYPARSING_VERSION}"])
+  # Add the libpath of the installed venv to import pyparsing
+  sys.path.append(os.path.join(VENV_PATH, f"lib/python{sys.version_info.major}."
+                                          f"{sys.version_info.minor}/site-packages/"))


 def get_comment_input(message, line_number=0, side=COMMENT_REVISION_SIDE,
@@ -253,106 +260,110 @@ def add_misc_comments_for_line(comments, line, curr_file, curr_line_num, dryrun=
        curr_line_num, context_line=line, dryrun=dryrun))


+def compare_thrift_structs(curr_file, old_structs, curr_structs, comments):
+  # Skip if the file is removed or new
+  if old_structs is None or curr_structs is None:
+    return
+  # Adding new structs is OK. Compare fields of existing structs.
+  for struct_name, old_fields in old_structs.items():
+    if struct_name not in curr_structs:
+      print(f"Removed struct {struct_name}")
+      continue
+    curr_fields = curr_structs[struct_name]
+    for fid, old_field in old_fields.items():
+      if fid not in curr_fields:
+        if old_field.qualifier == 'required':
+          comments[curr_file].append(get_comment_input(
+            f"Deleting required field '{old_field.name}' of {struct_name}"
+            + THRIFT_WARNING_SUFFIX,
+            old_field.line_num, COMMENT_PARENT_SIDE))
+        continue
+      curr_field = curr_fields.pop(fid)
+      if curr_field.name != old_field.name:
+        comments[curr_file].append(get_comment_input(
+          f"Renaming field '{old_field.name}' to '{curr_field.name}' in {struct_name}"
+          + THRIFT_WARNING_SUFFIX,
+          curr_field.line_num))
+        continue
+      if curr_field.qualifier != old_field.qualifier:
+        comments[curr_file].append(get_comment_input(
+          f"Changing field '{old_field.name}' from {old_field.qualifier} to "
+          f"{curr_field.qualifier} in {struct_name}" + THRIFT_WARNING_SUFFIX,
+          curr_field.line_num))
+        continue
+      if curr_field.type != old_field.type:
+        comments[curr_file].append(get_comment_input(
+          f"Changing type of field '{old_field.name}' from {old_field.type} to "
+          f"{curr_field.type} in {struct_name}" + THRIFT_WARNING_SUFFIX,
+          curr_field.line_num))
+    if len(curr_fields) > 0:
+      for new_field in curr_fields.values():
+        if new_field.qualifier == 'required':
+          comments[curr_file].append(get_comment_input(
+            f"Adding a required field '{new_field.name}' in {struct_name}"
+            + THRIFT_WARNING_SUFFIX,
+            new_field.line_num))
+  new_struct_names = curr_structs.keys() - old_structs.keys()
+  if len(new_struct_names) > 0:
+    print(f"New structs {new_struct_names}")
+
+
+def compare_thrift_enums(curr_file, old_enums, curr_enums, comments):
+  # Skip if the file is removed or new
+  if old_enums is None or curr_enums is None:
+    return
+  for enum_name, old_items in old_enums.items():
+    if enum_name not in curr_enums:
+      print(f"Removed enum {enum_name}")
+      continue
+    curr_items = curr_enums[enum_name]
+    for value, old_item in old_items.items():
+      if value not in curr_items:
+        comments[curr_file].append(get_comment_input(
+          f"Removing the enum item {enum_name}.{old_item.name}={value}"
+          + THRIFT_WARNING_SUFFIX,
+          old_item.line_num, COMMENT_PARENT_SIDE))
+        continue
+      if curr_items[value].name != old_item.name:
+        comments[curr_file].append(get_comment_input(
+          f"Enum item {old_item.name}={value} changed to "
+          f"{curr_items[value].name}={value} in {enum_name}. This"
+          + THRIFT_WARNING_SUFFIX,
+          curr_items[value].line_num))
+
+
+def extract_thrift_defs_of_revision(revision, file_name):
+  """Extract a dict of thrift structs from pyparsing.ParseResults"""
+  # Importing thrift_parser depends on pyparsing being installed in setup_virtualenv().
+  from thrift_parser import extract_thrift_defs
+  try:
+    contents = check_output(["git", "show", f"{revision}:{file_name}"],
+                            universal_newlines=True)
+  except Exception as e:
+    # Usually it's due to file doesn't exist in that revision
+    print(f"Failed to read {file_name} of revision {revision}: {e}")
+    return None, None
+  return extract_thrift_defs(contents)
+
+
 def get_catalog_compatibility_comments(base_revision, revision, dryrun=False):
  """Get comments on Thrift/FlatBuffers changes that might break the communication
  between impalad and catalogd/statestore"""
  comments = defaultdict(lambda: [])

  diff = check_output(
-      ["git", "diff", "-U0", "{0}..{1}".format(base_revision, revision),
-       "--", "common/thrift"],
+      ["git", "diff", "--name-only", "{0}..{1}".format(base_revision, revision),
+       "--", "common/thrift/*.thrift"],
      universal_newlines=True)
-  curr_file = None
-  check_source_file = False
-  has_concerns = False
-  in_enum_clause = False
-  is_thrift_file = False
-  # Line numbers in the old file and in the new file
-  old_line_num = 0
-  new_line_num = 0
-  for diff_line in diff.splitlines():
-    if diff_line.startswith("--- "):
+  for curr_file in diff.splitlines():
+    if os.path.basename(curr_file) in EXCLUDE_THRIFT_FILES:
      continue
-    elif diff_line.startswith("+++ "):
-      # Start of diff for a file. Add a comment for the previous file if has concerns.
-      if curr_file and check_source_file and has_concerns:
-        comments[curr_file].append(get_comment_input(THRIFT_FILE_COMMENT))
-      # Strip off "+++ b/" to get the file path.
-      curr_file = diff_line[6:]
-      check_source_file = False
-      has_concerns = False
-      in_enum_clause = False
-      is_thrift_file = os.path.splitext(curr_file)[1] == ".thrift"
-      if is_thrift_file and os.path.basename(curr_file) not in EXCLUDE_THRIFT_FILES:
-        check_source_file = True
-    elif check_source_file and diff_line.startswith("@@ "):
-      # Figure out the starting line of the hunk. Examples:
-      #  @@ -932,0 +933,5 @@ enum TImpalaQueryOptions {
-      #  @@ -55,0 +56 @@ enum TPlanNodeType {
-      #  @@ -109 +109 @@ struct THdfsTableSink {
-      # We want to extract the start line for the added lines
-      match = RANGE_RE.match(diff_line)
-      if not match:
-        raise Exception("Pattern did not match diff line:\n{0}".format(diff_line))
-      old_line_num = int(match.group(1))
-      new_line_num = int(match.group(2))
-      in_enum_clause = match.group(3).startswith("enum ")
-    elif check_source_file and diff_line.startswith("-"):
-      # Check if deleting/modifying a required field
-      change = diff_line[1:].strip()
-      if in_enum_clause and not change.startswith("//"):
-        has_concerns = True
-        comments[curr_file].append(get_comment_input(
-          "Modifying/deleting this enum item" + THRIFT_WARNING_SUFFIX,
-          old_line_num, COMMENT_PARENT_SIDE, diff_line, dryrun))
-      elif REQUIRED_FIELD_RE.match(change):
-        has_concerns = True
-        comments[curr_file].append(get_comment_input(
-          "Modifying/deleting this required field" + THRIFT_WARNING_SUFFIX,
-          old_line_num, COMMENT_PARENT_SIDE, diff_line, dryrun))
-      elif OPTIONAL_FIELD_RE.match(change):
-        # Removing an optional field should be OK unless the field number is reused by
-        # a new optional field. Add a general comment on the file.
-        has_concerns = True
-      old_line_num += 1
-    elif is_thrift_file and diff_line.startswith("+"):
-      change = diff_line[1:].strip()
-      # Check includes in Thrift files
-      match = INCLUDE_FILE_RE.match(change)
-      # Case 1: a target file includes a whitelist file, E.g. Frontend.thrift includes
-      # LineageGraph.thrift. Future changes in LineageGraph.thrift might impact
-      # Frontend.thrift so
-      #  - LineageGraph.thrift should be removed from the whitelist (i.e.
-      #    EXCLUDE_THRIFT_FILES) if it will be used in impalad and catalogd.
-      #  - Or developers should make sure the included new fields are read/write only
-      #    in impalads or only in catalogd.
-      if match and check_source_file and match.group(1) in EXCLUDE_THRIFT_FILES:
-        comments[curr_file].append(get_comment_input(
-            "Future changes in {0} might break the compatibility between impalad and "
-            "catalogd/statestore. Please remove {0} from EXCLUDE_THRIFT_FILES in "
-            "bin/jenkins/critique-gerrit-review.py or make sure the new fields are "
-            "read/write only in impalads or only in catalogd".format(match.group(1)),
-            new_line_num, context_line=diff_line, dryrun=dryrun))
-      # Case 2: A whitelist file includes a target file, e.g. PlanNodes.thrift includes
-      # Data.thrift. Note that PlanNodes.thrift is supposed to be used inside impalad.
-      # Data.thrift is used in both impalad and catalogd. We should ensure new fields
-      # included from Data.thrift are not set from catalogd.
-      elif (match and not check_source_file
-          and match.group(1) not in EXCLUDE_THRIFT_FILES):
-        comments[curr_file].append(get_comment_input(
-            "Thrift objects in the current file are supposed to be used inside "
-            "impalads. Please make sure new fields includes from {} are not set by "
-            "catalogd.".format(match.group(1)),
-            new_line_num, context_line=diff_line, dryrun=dryrun))
-      elif check_source_file:
-        if REQUIRED_FIELD_RE.match(change):
-          has_concerns = True
-          comments[curr_file].append(get_comment_input(
-              "Modifying/adding this required field" + THRIFT_WARNING_SUFFIX,
-              new_line_num, context_line=diff_line, dryrun=dryrun))
-      new_line_num += 1
-  if curr_file and check_source_file and has_concerns:
-    comments[curr_file].append(get_comment_input(THRIFT_FILE_COMMENT))
+    print(f"Parsing {curr_file}")
+    curr_structs, curr_enums = extract_thrift_defs_of_revision(revision, curr_file)
+    old_structs, old_enums = extract_thrift_defs_of_revision(base_revision, curr_file)
+    compare_thrift_structs(curr_file, old_structs, curr_structs, comments)
+    compare_thrift_enums(curr_file, old_enums, curr_enums, comments)
+
  merge_comments(
      comments, get_flatbuffers_compatibility_comments(base_revision, revision))
  return comments
--- a/bin/jenkins/thrift_parser.py
+++ b/bin/jenkins/thrift_parser.py
@@ -0,0 +1,259 @@
+#!/usr/bin/python3
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+# Developed based on example at:
+# https://github.com/pyparsing/pyparsing/blob/c4cf4a5/examples/protobuf_parser.py
+#
+# TODO: Consider using thriftpy2.parser. Need it to expose lineno.
+#
+
+from collections import defaultdict
+import os
+import sys
+from pyparsing import (
+  Forward,
+  Group,
+  Keyword,
+  OneOrMore,
+  Optional,
+  ParseBaseException,
+  ParseResults,
+  Suppress,
+  Word,
+  ZeroOrMore,
+  alphanums,
+  alphas,
+  cppStyleComment,
+  delimitedList,
+  line,
+  lineno,
+  oneOf,
+  pyparsing_common,
+  pythonStyleComment,
+  quotedString,
+  restOfLine,
+)
+
+# Simply treating strings like 'CatalogServiceVersion.V2' as an identifier by allowing
+# '.' in the bodyChars
+ident = Word(alphas + "_", bodyChars=alphanums + "_.").setName("identifier")
+integer = pyparsing_common.integer
+
+LBRACE, RBRACE, LBRACK, RBRACK, LPAR, RPAR, EQ, SEMI, COMMA, COLON, LT, GT =\
+    map(Suppress, "{}[]()=;,:<>")
+
+optionalCommaOrSemi = Optional(COMMA | SEMI)
+
+# Ref: https://github.com/apache/thrift/blob/154d154/compiler/cpp/src/thrift/thriftl.ll
+TYPEDEF_ = Keyword('typedef')
+NAMESPACE_ = Keyword('namespace')
+EXCEPTION_ = Keyword('exception')
+STRUCT_ = Keyword('struct')
+UNION_ = Keyword('union')
+REQUIRED_ = Keyword('required')
+OPTIONAL_ = Keyword('optional')
+ENUM_ = Keyword('enum')
+EXTENDS_ = Keyword('extends')
+SERVICE_ = Keyword('service')
+TRUE_ = Keyword('true')
+FALSE_ = Keyword('false')
+THROWS_ = Keyword('throws')
+LIST_ = Keyword('list')
+MAP_ = Keyword('map')
+SET_ = Keyword('set')
+VOID_ = Keyword('void')
+CONST_ = Keyword('const')
+
+typespec = Forward()
+# Scalar types and customized types
+simple_types = oneOf("void bool byte i8 i16 i32 i64 double string binary") | ident
+list_type = (LIST_ | SET_) + LT + Group(typespec) + GT
+map_type = MAP_ + LT + Group(typespec + COMMA + typespec) + GT
+typespec << (list_type | map_type | simple_types)
+typespec.setName("typespec")
+
+list_value = LBRACK + ZeroOrMore(delimitedList(ident)) + RBRACK
+rvalue = pyparsing_common.fnumber | TRUE_ | FALSE_ | ident | quotedString | list_value
+
+
+class ThriftField:
+  def __init__(self, line_num, field_id, qualifier, ts, name, default_value):
+    self.id = field_id
+    self.qualifier = qualifier
+    self.type = ts
+    self.name = name
+    self.default_value = default_value
+    self.line_num = line_num
+
+  def __repr__(self):
+    return f"Field{self.id} {self.qualifier} {self.type} {self.name} L{self.line_num}"
+
+
+def create_thrift_field(src_string, locn, toks):
+  # 'qualifier' and 'default_value' are optional. Use getattr() to set them as None
+  # when they are missing
+  qualifier = getattr(toks, "qualifier")
+  default_value = getattr(toks, "defaultValue")
+  return ThriftField(lineno(locn, src_string), toks["id"], qualifier, toks["ts"],
+                     toks["name"], default_value)
+
+
+def gen_type_string(toks):
+  """Generate type strings from the nested tokens, e.g.
+  [['set', ['PlanNodes.TRuntimeFilterType']]] -> 'set<PlanNodes.TRuntimeFilterType>',
+  [['map', ['int', 'string']]] -> 'map<int,string>'.
+  """
+  if not isinstance(toks, ParseResults):
+    return str(toks)
+  if len(toks) == 1:
+    # Somehow the first level of 'toks' has two nested levels. E.g. for list<string>,
+    # 'toks' is ParseResults([ParseResults(['list', ParseResults(['string'])])]).
+    return gen_type_string(toks[0])
+  assert len(toks) == 2
+  if toks[0] == 'set' or toks[0] == 'list':
+    return f"{toks[0]}<{gen_type_string(toks[1])}>"
+  assert toks[0] == 'map'
+  return f"map<{gen_type_string(toks[1][0])},{gen_type_string(toks[1][1])}>"
+
+
+fieldDefn = (
+    integer("id")
+    + COLON
+    + Optional(REQUIRED_ | OPTIONAL_, "optional")("qualifier")
+    + Group(typespec)("ts").setParseAction(gen_type_string)
+    + ident("name")
+    + Optional(EQ + rvalue("defaultValue"))
+    + optionalCommaOrSemi
+).setParseAction(create_thrift_field)
+
+fieldsDefBody = Group(ZeroOrMore(fieldDefn))("body")
+
+structDefn = STRUCT_ - ident("name") + LBRACE + fieldsDefBody + RBRACE
+
+unionDefn = UNION_ - ident("name") + LBRACE + fieldsDefBody + RBRACE
+
+
+class ThriftEnumItem:
+  def __init__(self, name, value, line_num):
+    self.name = name
+    self.value = value
+    self.line_num = line_num
+
+  def __repr__(self):
+    return f"EnumItem {self.name} L{self.line_num}"
+
+
+def create_thrift_enum_item(src_string, locn, toks):
+  # toks[0] is in the form of [name] or [name, value]
+  value = toks[0][1] if len(toks[0]) > 1 else None
+  return ThriftEnumItem(toks[0][0], value, lineno(locn, src_string))
+
+
+enumDefn = (
+    ENUM_ - ident("name") + LBRACE
+    + OneOrMore(
+        Group(ident + Optional(EQ + integer)).setParseAction(create_thrift_enum_item)
+        + optionalCommaOrSemi
+    )("evalues")
+    + RBRACE
+)
+
+exceptionsDefn = LPAR + OneOrMore(fieldDefn) + RPAR
+
+methodDefn = (
+    typespec("returnType")
+    - ident("methodName")
+    + LPAR
+    + fieldsDefBody
+    + RPAR
+    + Optional(THROWS_ + Group(exceptionsDefn))
+    + optionalCommaOrSemi
+)
+
+serviceDefn = (
+    SERVICE_ - ident("serviceName") + Optional(EXTENDS_ + ident)
+    + LBRACE + ZeroOrMore(Group(methodDefn)) + RBRACE
+)
+typeDefn = TYPEDEF_ - typespec("typespec") + ident("ident")
+
+topLevelStatement = Group(
+  structDefn("struct")
+  | unionDefn("union")
+  | enumDefn("enum")
+  | serviceDefn("service")
+)
+
+thrift_parser = ZeroOrMore(topLevelStatement)
+
+thrift_parser.ignore(cppStyleComment)
+thrift_parser.ignore(pythonStyleComment)
+# TODO: Add syntax for const variables. Currently they are all defined in a single line
+#  so we can ignore them like this.
+thrift_parser.ignore("const " + restOfLine)
+thrift_parser.ignore("namespace " + restOfLine)
+thrift_parser.ignore("include " + restOfLine)
+
+
+def extract_thrift_defs(thrift_file_content):
+  """Extract a dict of thrift structs from pyparsing.ParseResults"""
+  parse_results = thrift_parser.parseString(thrift_file_content)
+  structs = defaultdict()
+  enums = defaultdict()
+  for top_level_item in parse_results:
+    if top_level_item.getName() == 'struct':
+      # A dict from field id to thrift_parser.ThriftField
+      struct_fields = defaultdict()
+      for field in top_level_item.body:
+        struct_fields[field.id] = field
+      structs[top_level_item.name] = struct_fields
+    elif top_level_item.getName() == 'enum':
+      # A dict from enum value (i32) to thrift_parser.ThriftEnumItem
+      enum_items = defaultdict()
+      index = 0
+      for item in top_level_item.evalues:
+        if item.value is not None:
+          enum_items[item.value] = item
+        else:
+          # Deal with enum items without value(index), e.g. items of TQueryTableColumn
+          enum_items[index] = item
+          index += 1
+      enums[top_level_item.name] = enum_items
+  return structs, enums
+
+
+if __name__ == '__main__':
+  # thrift_dir = os.environ["IMPALA_HOME"] + "/common/thrift"
+  thrift_dir = sys.argv[1]
+  for entry in os.scandir(thrift_dir):
+    if entry.is_file() and entry.name.endswith(".thrift"):
+      print("parsing " + entry.path)
+      with open(entry.path, 'r') as in_file:
+        contents = in_file.read()
+        try:
+          structs, enums = extract_thrift_defs(contents)
+          for struct_name, struct_fields in structs.items():
+            print(f"struct {struct_name}")
+            for field in struct_fields.values():
+              print(field)
+          for enum_name, enum_items in enums.items():
+            print(f"enum {enum_name}")
+            for item in enum_items.values():
+              print(item)
+        except ParseBaseException as e:
+          print("Failed to parse line:\n" + line(e.loc, contents))
+          raise e
--- a/infra/python/deps/requirements.txt
+++ b/infra/python/deps/requirements.txt
@@ -39,7 +39,7 @@ pg8000 == 1.10.2
 prettytable == 0.7.2
 prometheus-client == 0.12.0
 psutil == 5.6.3
-pyparsing == 2.0.3
+pyparsing == 2.4.7
 pytest == 2.9.2
  py == 1.4.32
  pytest-forked == 0.2
--- a/tests/infra/test_thrift_parser.py
+++ b/tests/infra/test_thrift_parser.py
@@ -0,0 +1,38 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+
+from __future__ import absolute_import
+import os
+from subprocess import check_output
+
+
+def test_thrift_parser():
+  """Make sure thrift_parser can parse all the thrift files"""
+  impala_home = os.getenv("IMPALA_HOME")
+  # Only supports python3
+  output = check_output([os.path.join(impala_home, "bin/impala-python3"),
+                         os.path.join(impala_home, "bin/jenkins/thrift_parser.py"),
+                         os.path.join(impala_home, "common/thrift")],
+                        universal_newlines=True)
+  assert "TStatus" in output
+  assert "Field1 required ErrorCodes.TErrorCode status_code" in output
+  assert "Field2 optional list<string> error_msgs" in output
+  assert "TErrorCode" in output
+  assert "EnumItem OK" in output
+  assert "EnumItem UNUSED" in output
+  assert "EnumItem GENERAL" in output
+  assert "EnumItem CANCELLED" in output