mirror of
https://github.com/apache/impala.git
synced 2025-12-19 09:58:28 -05:00
To remove the dependency on Python 2, existing scripts need to use python3 rather than python. These commands find those locations (for impala-python and regular python):

    git grep impala-python | grep -v impala-python3 | grep -v impala-python-common | grep -v init-impala-python
    git grep bin/python | grep -v python3

This removes or switches most of these locations by various means:
1. If a python file has a #!/bin/env impala-python (or python) but doesn't have a main function, it removes the hash-bang and makes sure that the file is not executable.
2. Most scripts can simply switch from impala-python to impala-python3 (or python to python3) with minimal changes.
3. The cm-api pypi package (which doesn't support Python 3) has been replaced by the cm-client pypi package and interfaces have changed. Rather than migrating the code (which hasn't been used in years), this deletes the old code and stops installing cm-api into the virtualenv. The code can be restored and revamped if there is any interest in interacting with CM clusters.
4. This switches tests/comparison over to impala-python3, but this code has bit-rotted. Some pieces can be run manually, but it can't be fully verified with Python 3. It shouldn't hold back the migration on its own.
5. This also replaces locations of impala-python in comments / documentation / READMEs.
6. kazoo (used for interacting with HBase) needed to be upgraded to a version that supports Python 3. The newest version of kazoo requires upgrades of other component versions, so this uses kazoo 2.8.0 to avoid needing other upgrades.

The two remaining uses of impala-python are:
- bin/cmake_aux/create_virtualenv.sh
- bin/impala-env-versioned-python
These will be removed separately when we drop Python 2 support completely. In particular, these are useful for testing impala-shell with Python 2 until we stop supporting Python 2 for impala-shell.
The docker-based tests still use /usr/bin/python, but this can be switched over independently (and doesn't impact impala-python).

Testing:
- Ran core job
- Ran build + dataload on CentOS 7, Red Hat 8
- Manual testing of individual scripts (except some bitrotted areas like the random query generator)

Change-Id: If209b761290bc7e7c716c312ea757da3e3bca6dc
Reviewed-on: http://gerrit.cloudera.org:8080/23468
Reviewed-by: Michael Smith <michael.smith@cloudera.com>
Tested-by: Michael Smith <michael.smith@cloudera.com>
293 lines
12 KiB
Python
Executable File
293 lines
12 KiB
Python
Executable File
#!/usr/bin/env python3
|
|
#
|
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
|
# you may not use this file except in compliance with the License.
|
|
# You may obtain a copy of the License at
|
|
#
|
|
# http://www.apache.org/licenses/LICENSE-2.0
|
|
#
|
|
# Unless required by applicable law or agreed to in writing, software
|
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
# See the License for the specific language governing permissions and
|
|
# limitations under the License.
|
|
|
|
# Future imports must happen at the beginning of the file
|
|
from __future__ import absolute_import, division, print_function
|
|
|
|
HELP = '''
|
|
Compares two specified branches, using the Gerrit Change-Id as the
|
|
primary identifier. Ignored commits can be added via a JSON
|
|
configuration file or with a special string in the commit message.
|
|
Changes can be cherrypicked with the --cherry_pick argument.
|
|
|
|
This script can be used to keep two development branches
|
|
(by default, "master" and "2.x", in sync). It is equivalent
|
|
to cherry-picking commits one by one, but automates identifying
|
|
the commits to cherry-pick. Unlike "git cherry", it uses
|
|
the Gerrit Change-Id identifier in the commit message
|
|
as a key.
|
|
|
|
The ignored_commits.json configuration file is of the following
|
|
form. Note that commits are the full 20-byte git hashes.
|
|
|
|
[
|
|
{
|
|
"source": "master",
|
|
"target": "2.x",
|
|
"commits": [
|
|
{ "hash": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "comment": "..."},
|
|
{ "hash": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", "comment": "..."}
|
|
]
|
|
}
|
|
]
|
|
|
|
The --target_remote_name is optional. If not specified, the target remote is set to
|
|
the value of the --source_remote_name. Debug logging to stderr can be enabled with
|
|
--verbose.
|
|
|
|
Example:
|
|
|
|
$bin/compare_branches.py --source_branch master --target_branch 2.x
|
|
--------------------------------------------------------------------------------
|
|
Commits in asf-gerrit/master but not in asf-gerrit/2.x:
|
|
--------------------------------------------------------------------------------
|
|
35a3e186d61b8f365b0f7d1127be311758437e16 IMPALA-5478: Run TPCDS queries with decimal_v2 enabled (Thu Jan 18 03:28:51 2018 +0000) - Taras Bobrovytsky
|
|
d9b6fd073055b436c7404d49454dc215b2c7a369 IMPALA-6386: Invalidate metadata at table level for dataload (Wed Jan 17 22:52:58 2018 +0000) - Joe McDonnell
|
|
dcc7be0ed483b332dac22d6596f56ff2a6cfdaa3 IMPALA-4315: Allow USE and SHOW TABLES if the user has only column privileges (Wed Jan 17 22:40:13 2018 +0000) - Csaba Ringhofer
|
|
b6e43133e671773d2757612f72cfcdb0ff303226 IMPALA-6399: Increase timeout in test_observability to reduce flakiness (Wed Jan 17 22:31:33 2018 +0000) - Lars Volker
|
|
--------------------------------------------------------------------------------
|
|
Jira keys referenced (Note: not all commit messages will reference a jira key):
|
|
IMPALA-5478,IMPALA-6386,IMPALA-4315,IMPALA-6399
|
|
--------------------------------------------------------------------------------
|
|
'''
|
|
|
|
import argparse
|
|
import json
|
|
import logging
|
|
import os
|
|
import re
|
|
import subprocess
|
|
import sys
|
|
|
|
from collections import defaultdict
|
|
from collections import OrderedDict
|
|
from pprint import pformat
|
|
|
|
def create_parser():
  """Build the command-line argument parser for this script.

  The module-level HELP text is used as the parser description and is printed
  with its own line breaks; options that carry help text also display their
  default value.

  Returns:
    The configured argparse.ArgumentParser.
  """
  class _HelpFormatter(argparse.ArgumentDefaultsHelpFormatter,
                       argparse.RawDescriptionHelpFormatter):
    """Shows argument defaults while leaving the description untouched."""

  # By default the ignore list lives next to this script.
  script_dir = os.path.dirname(os.path.abspath(__file__))
  ignored_commits_default = os.path.join(script_dir, 'ignored_commits.json')

  parser = argparse.ArgumentParser(description=HELP, formatter_class=_HelpFormatter)
  add = parser.add_argument

  add('--cherry_pick', action='store_true',
      help=('Cherry-pick mismatched commits to current branch. This '
            'must match (in the hash sense) the target branch.'))
  add('--partial_ok', action='store_true',
      help='Exit with success if at least one cherrypick succeeded.')
  add('--source_branch', default='master')
  add('--target_branch', default='2.x')
  add('--source_remote_name', default='asf-gerrit',
      help=('Name of the source git remote. If set to empty string, '
            'this remote is not fetched and branch names are used '
            ' as is; otherwise, the source ref is remote/branch.'))
  add('--target_remote_name', default=None,
      help=('Name of the target git remote; defaults to source remote. '
            'Empty strings are handled the same way as --source_remote_name.'))
  add('--ignored_commits_file', default=ignored_commits_default,
      help='JSON File that contains ignored commits as specified in the help')
  add('--skip_commits_matching',
      default="Cherry-pick.?:.?not (for|to) {branch}",
      help=('Regex searched for in commit messages that causes the commit to be ignored.'
            ' {branch} is replaced with target branch; the search is case-insensitive'))
  add('--verbose', '-v', action='store_true',
      help='Turn on DEBUG and INFO logging')
  return parser
|
|
|
|
def read_ignored_commits(ignored_commits_file):
  '''Returns a dictionary containing commits that should be ignored.

  ignored_commits_file is a path to a JSON file with schema
  specified at the top of this file: a list of objects, each with
  "source", "target" and a "commits" list of {"hash": ...} objects.

  The returned defaultdict maps (source_branch, target_branch) tuples to
  the set of git hashes to ignore for that pair. If the file contains
  several entries for the same pair, their hashes are merged. Looking up
  a pair that is not in the file yields an empty set.
  '''
  ignored_commits = defaultdict(set)
  with open(ignored_commits_file) as f:
    json_data = json.load(f)

  for result_dict in json_data:
    logging.debug("Parsing result_dict: %s", result_dict)
    branch_pair = (result_dict['source'], result_dict['target'])
    # update() rather than assignment, so that a second entry for the same
    # branch pair adds to the earlier hashes instead of replacing them.
    ignored_commits[branch_pair].update(
        commit["hash"] for commit in result_dict['commits'])
  return ignored_commits
|
|
|
|
def build_commit_map(branch, merge_base):
  '''Maps each Change-Id found on branch to its commit details.

  Runs "git log" for the commits reachable from branch but not from
  merge_base and returns an OrderedDict, in the order git log reports the
  commits, from change id to (hash, subject, author, date, body).

  Commits without a "Change-Id: " in their body are logged and left out.
  If a body has several, the first one is used.
  '''
  # Clear GIT_PAGER (for this process and the git processes it starts) so
  # that git does not send the log output through a pager.
  os.environ['GIT_PAGER'] = ''

  # ASCII unit/record separators delimit fields and commits in the output.
  field_sep = '\x1f'
  record_sep = '\x1e'
  pretty_format = field_sep.join(['%H', '%s', '%an', '%cd', '%b']) + record_sep
  git_cmd = ["git", "log", branch, "^" + merge_base, "--pretty=" + pretty_format,
             "--color=never"]
  raw_log = subprocess.check_output(git_cmd, universal_newlines=True)

  result = OrderedDict()
  for record in raw_log.split(record_sep):
    # "" means git log found no commits; "\n" is the newline git log
    # emits after the final record. Neither describes a commit.
    if record in ("", "\n"):
      continue
    commit_hash, subject, author, date, body = [
        part.strip() for part in record.split(field_sep)]
    change_ids = re.findall('Change-Id: (.*)', body)
    if not change_ids:
      logging.warning('Commit {0} ({1}...) has no Change-Id.'.format(
          commit_hash, subject[:40]))
      continue
    if len(change_ids) > 1:
      logging.warning("Commit %s contains multiple change ids; using first one.",
                      commit_hash)
    result[change_ids[0]] = (commit_hash, subject, author, date, body)

  logging.debug("Commit map for branch %s has size %d.", branch, len(result))
  return result
|
|
|
|
def cherrypick(cherry_pick_hashes, full_target_branch_name, partial_ok):
  """Cherrypicks the given commits onto the current HEAD.

  Before picking, checks that full_target_branch_name resolves to the same
  commit as HEAD; if not, prints a message and exits the process with
  status 1.

  cherry_pick_hashes is a sequence of git hashes. They are applied
  starting with the LAST element and ending with the first. The sequence
  is only read, never modified.

  If partial_ok is true and at least one cherrypick has already succeeded
  when a later one fails, the failed pick is aborted and the function
  returns normally. Otherwise a failed pick raises Exception, and the
  failed cherry-pick is not aborted.

  Note that this function does not push to the remote.
  """
  print("Cherrypicking %d changes." % (len(cherry_pick_hashes),))

  if not cherry_pick_hashes:
    return

  # Cherrypicking only makes sense if we're on the equivalent of the target branch.
  head_sha = subprocess.check_output(
      ['git', 'rev-parse', 'HEAD'], universal_newlines=True).strip()
  target_branch_sha = subprocess.check_output(
      ['git', 'rev-parse', full_target_branch_name], universal_newlines=True).strip()
  if head_sha != target_branch_sha:
    print("Cannot cherrypick because %s (%s) and HEAD (%s) are divergent." % (
        full_target_branch_name, target_branch_sha, head_sha))
    sys.exit(1)

  # reversed() walks the hashes back-to-front without reordering the
  # caller's list, which an in-place list.reverse() would do.
  for i, cherry_pick_hash in enumerate(reversed(cherry_pick_hashes)):
    ret = subprocess.call(
        ['git', 'cherry-pick', '--keep-redundant-commits', cherry_pick_hash])
    if ret == 0:
      continue
    if partial_ok and i > 0:
      # At least one earlier pick succeeded: clean up and stop quietly.
      subprocess.check_call(['git', 'cherry-pick', '--abort'])
      print("Failed to cherry-pick %s; stopping picks." % (cherry_pick_hash,))
      return
    raise Exception("Failed to cherry-pick: %s" % (cherry_pick_hash,))
|
|
|
|
def main():
  """Compares the source and target branches and optionally cherry-picks.

  Steps:
    1. Parse the command-line options and configure logging.
    2. Fetch the source and target branches, unless the corresponding
       remote name is the empty string.
    3. Build Change-Id -> commit maps for both branches, starting from
       their merge base.
    4. Print every source commit whose Change-Id is absent from the target
       and which is not ignored via the config file or its commit message,
       followed by the IMPALA jira keys found in those commits' subjects.
    5. If --cherry_pick was given, cherry-pick the printed commits.
  """
  options = create_parser().parse_args()

  logging.basicConfig(
      level=logging.DEBUG if options.verbose else logging.WARNING,
      format='%(asctime)s %(threadName)s %(levelname)s: %(message)s')

  if options.target_remote_name is None:
    options.target_remote_name = options.source_remote_name

  # Ensure all branches are up to date, unless remotes are disabled
  # by specifying them with an empty string.
  if options.source_remote_name != "":
    subprocess.check_call(['git', 'fetch', options.source_remote_name,
                           options.source_branch])
    full_source_branch_name = options.source_remote_name + '/' + options.source_branch
  else:
    full_source_branch_name = options.source_branch

  if options.target_remote_name != "":
    # Skip the second fetch when it would repeat the source fetch exactly.
    if options.source_remote_name != options.target_remote_name\
        or options.source_branch != options.target_branch:
      subprocess.check_call(['git', 'fetch', options.target_remote_name,
                             options.target_branch])
    full_target_branch_name = options.target_remote_name + '/' + options.target_branch
  else:
    full_target_branch_name = options.target_branch

  merge_base = subprocess.check_output(
      ["git", "merge-base", full_source_branch_name, full_target_branch_name],
      universal_newlines=True).strip()
  source_commits = build_commit_map(full_source_branch_name, merge_base)
  target_commits = build_commit_map(full_target_branch_name, merge_base)

  ignored_commits = read_ignored_commits(options.ignored_commits_file)
  logging.debug("ignored commits from %s:\n%s",
                options.ignored_commits_file, pformat(ignored_commits))
  # The config file is keyed by the plain branch names, without the remote.
  ignored_for_branches = ignored_commits[
      (options.source_branch, options.target_branch)]
  commits_ignored = []  # Track commits actually ignored for debug logging

  cherry_pick_hashes = []
  print('-' * 80)
  print('Commits in {0} but not in {1}:'.format(
      full_source_branch_name, full_target_branch_name))
  print('-' * 80)
  jira_keys = []
  jira_key_pat = re.compile(r'(IMPALA-\d+)')
  # The branch name is substituted as literal text. Without re.escape, regex
  # metacharacters in it (e.g. the "." in "2.x") would match unintended text.
  # The pattern is compiled once because it is the same for every commit.
  skip_commits_pat = re.compile(
      options.skip_commits_matching.format(branch=re.escape(options.target_branch)),
      re.IGNORECASE)

  for change_id, (commit_hash, msg, author, date, body) in source_commits.items():
    change_in_target = change_id in target_commits
    ignore_by_config = commit_hash in ignored_for_branches
    ignore_by_commit_message = skip_commits_pat.search("\n".join([msg, body]))

    # This conditional block just for debug logging of ignored commits
    if ignore_by_config or ignore_by_commit_message:
      if change_in_target:
        logging.debug("Not ignoring commit because change is already in target: {0}"
                      .format(commit_hash))
      else:
        if ignore_by_commit_message:
          logging.debug("Ignoring commit {0} by commit message.".format(commit_hash))
        else:
          logging.debug("Ignoring commit {0} by config file.".format(commit_hash))
        commits_ignored.append(commit_hash)
    else:
      logging.debug("NOT ignoring commit {0} since not in ignored commits ({1},{2})"
                    .format(commit_hash, options.source_branch, options.target_branch))

    if not change_in_target and not ignore_by_config and not ignore_by_commit_message:
      print('{0} {1} ({2}) - {3}'.format(commit_hash, msg, date, author))
      cherry_pick_hashes.append(commit_hash)
      jira_keys += jira_key_pat.findall(msg)

  print('-' * 80)

  print("Jira keys referenced (Note: not all commit messages will reference a jira key):")
  print(','.join(jira_keys))
  print('-' * 80)

  logging.debug("Commits actually ignored (change was not in target): {0}"
                .format(pformat(commits_ignored)))

  if options.cherry_pick:
    cherrypick(cherry_pick_hashes, full_target_branch_name, options.partial_ok)
|
|
|
|
# Run the comparison only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
  main()
|