#!/usr/bin/env python # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. HELP = ''' Compares two specified branches, using the Gerrit Change-Id as the primary identifier. Ignored commits can be added via a JSON configuration file or with a special string in the commit message. Changes can be cherrypicked with the --cherry_pick argument. This script can be used to keep two development branches (by default, "master" and "2.x", in sync). It is equivalent to cherry-picking commits one by one, but automates identifying the commits to cherry-pick. Unlike "git cherry", it uses the Gerrit Change-Id identifier in the commit message as a key. The ignored_commits.json configuration file is of the following form. Note that commits are the full 20-byte git hashes. [ { "source": "master", "target": "2.x", "commits": [ { "hash": "aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa", "comment": "...", { "hash": "bbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbbb", "comment": "..."} ] } ] The --target_remote_name is optional. If not specified, the target remote is set to the value of the --source_remote_name. Debug logging to stderr can be enabled with --verbose. Example: $bin/compare_branches.py --source_branch master --target_branch 2.x -------------------------------------------------------------------------------- Commits in asf-gerrit/master but not in asf-gerrit/2.x: -------------------------------------------------------------------------------- 35a3e186d61b8f365b0f7d1127be311758437e16 IMPALA-5478: Run TPCDS queries with decimal_v2 enabled (Thu Jan 18 03:28:51 2018 +0000) - Taras Bobrovytsky d9b6fd073055b436c7404d49454dc215b2c7a369 IMPALA-6386: Invalidate metadata at table level for dataload (Wed Jan 17 22:52:58 2018 +0000) - Joe McDonnell dcc7be0ed483b332dac22d6596f56ff2a6cfdaa3 IMPALA-4315: Allow USE and SHOW TABLES if the user has only column privileges (Wed Jan 17 22:40:13 2018 +0000) - Csaba Ringhofer b6e43133e671773d2757612f72cfcdb0ff303226 IMPALA-6399: Increase timeout in test_observability to reduce flakiness (Wed Jan 17 22:31:33 2018 +0000) - Lars Volker -------------------------------------------------------------------------------- Jira keys referenced (Note: not all commit messages will reference a jira key): IMPALA-5478,IMPALA-6386,IMPALA-4315,IMPALA-6399 -------------------------------------------------------------------------------- ''' import argparse import json import logging import os import re import sh import sys from collections import defaultdict try: from collections import OrderedDict except ImportError: from ordereddict import OrderedDict from pprint import pformat def create_parser(): class CustomFormatter(argparse.ArgumentDefaultsHelpFormatter, argparse.RawDescriptionHelpFormatter): """ Mix-in to leave the description alone, but show defaults. """ pass parser = argparse.ArgumentParser( formatter_class=CustomFormatter, description=HELP) parser.add_argument('--cherry_pick', action='store_true', default=False, help='Cherry-pick mismatched commits to current branch. This ' + 'must match (in the hash sense) the target branch.') parser.add_argument('--source_branch', default='master') parser.add_argument('--target_branch', default='2.x') parser.add_argument('--source_remote_name', default='asf-gerrit', help='Name of the source git remote. If set to empty string, ' + 'this remote is not fetched and branch names are used ' + ' as is; otherwise, the source ref is remote/branch.') parser.add_argument('--target_remote_name', default=None, help='Name of the target git remote; defaults to source remote. ' + 'Empty strings are handled the same way as --source_remote_name.') default_ignored_commits_path = os.path.join( os.path.dirname(os.path.abspath(__file__)), 'ignored_commits.json') parser.add_argument('--ignored_commits_file', default=default_ignored_commits_path, help='JSON File that contains ignored commits as specified in the help') parser.add_argument('--skip_commits_matching', default="Cherry-picks: not for {branch}", help='String in commit messages that causes the commit to be ignored. ' + ' {branch} is replaced with target branch; the search is case-insensitive') parser.add_argument('--verbose', '-v', action='store_true', default=False, help='Turn on DEBUG and INFO logging') return parser def read_ignored_commits(ignored_commits_file): '''Returns a dictionary containing commits that should be ignored. ignored_commits_file is a path to a JSON file with schema specified at the top of this file. The return structure has dictionary keys are a tuple containing (source_branch, target_branch) and values are a set of git hashes. ''' ignored_commits = defaultdict(set) with open(ignored_commits_file) as f: json_data = json.load(f) for result_dict in json_data: logging.debug("Parsing result_dict: {0}".format(result_dict)) ignored_commits[(result_dict['source'], result_dict['target'])] =\ set([ commit["hash"] for commit in result_dict['commits'] ]) return ignored_commits def build_commit_map(branch, merge_base): '''Creates a map from change id to (hash, subject, author, date, body).''' # Disable git pager in order for the sh.git.log command to work os.environ['GIT_PAGER'] = '' fields = ['%H', '%s', '%an', '%cd', '%b'] pretty_format = '\x1f'.join(fields) + '\x1e' result = OrderedDict() for line in sh.git.log( branch, "^" + merge_base, pretty=pretty_format, color='never').split('\x1e'): if line == "": # if no changes are identified by the git log, we get an empty string continue if line == "\n": # git log adds a newline to the end; we can skip it continue commit_hash, subject, author, date, body = [t.strip() for t in line.split('\x1f')] change_id_matches = re.findall('Change-Id: (.*)', body) if change_id_matches: if len(change_id_matches) > 1: logging.warning("Commit %s contains multiple change ids; using first one.", commit_hash) change_id = change_id_matches[0] result[change_id] = (commit_hash, subject, author, date, body) else: logging.warning('Commit {0} ({1}...) has no Change-Id.'.format( commit_hash, subject[:40])) logging.debug("Commit map for branch %s has size %d.", branch, len(result)) return result def cherrypick(cherry_pick_hashes, full_target_branch_name): """Cherrypicks the given commits. Also, asserts that full_target_branch_name matches the current HEAD. cherry_pick_hashes is a list of git hashes, in the order to be cherry-picked. Note that this function does not push to the remote. """ print "Cherrypicking %d changes." % (len(cherry_pick_hashes),) if len(cherry_pick_hashes) == 0: return # Cherrypicking only makes sense if we're on the equivalent of the target branch. head_sha = sh.git('rev-parse', 'HEAD').strip() target_branch_sha = sh.git('rev-parse', full_target_branch_name).strip() if head_sha != target_branch_sha: print "Cannot cherrypick because %s (%s) and HEAD (%s) are divergent." % ( full_target_branch_name, target_branch_sha, head_sha) sys.exit(1) cherry_pick_hashes.reverse() for cherry_pick_hash in cherry_pick_hashes: sh.git('cherry-pick', '--keep-redundant-commits', cherry_pick_hash) def main(): parser = create_parser() options = parser.parse_args() log_level = logging.WARNING if options.verbose: log_level = logging.DEBUG logging.basicConfig(level=log_level, format='%(asctime)s %(threadName)s %(levelname)s: %(message)s') if options.target_remote_name is None: options.target_remote_name = options.source_remote_name # Ensure all branches are up to date, unless remotes are disabled # by specifying them with an empty string. if options.source_remote_name != "": sh.git.fetch(options.source_remote_name) full_source_branch_name = options.source_remote_name + '/' + options.source_branch else: full_source_branch_name = options.source_branch if options.target_remote_name != "": if options.source_remote_name != options.target_remote_name: sh.git.fetch(options.target_remote_name) full_target_branch_name = options.target_remote_name + '/' + options.target_branch else: full_target_branch_name = options.target_branch merge_base = sh.git("merge-base", full_source_branch_name, full_target_branch_name).strip() source_commits = build_commit_map(full_source_branch_name, merge_base) target_commits = build_commit_map(full_target_branch_name, merge_base) ignored_commits = read_ignored_commits(options.ignored_commits_file) logging.debug("ignored commits from {0}:\n{1}" .format(options.ignored_commits_file, pformat(ignored_commits))) commits_ignored = [] # Track commits actually ignored for debug logging cherry_pick_hashes = [] print '-' * 80 print 'Commits in {0} but not in {1}:'.format( full_source_branch_name, full_target_branch_name) print '-' * 80 jira_keys = [] jira_key_pat = re.compile(r'(IMPALA-\d+)') skip_commits_matching = options.skip_commits_matching.replace( "{branch}", options.target_branch) for change_id, (commit_hash, msg, author, date, body) in source_commits.iteritems(): change_in_target = change_id in target_commits ignore_by_config = commit_hash in ignored_commits[ (options.source_branch, options.target_branch)] ignore_by_commit_message = skip_commits_matching.lower() in msg.lower() \ or skip_commits_matching.lower() in body.lower() # This conditional block just for debug logging of ignored commits if ignore_by_config or ignore_by_commit_message: if change_in_target: logging.debug("Not ignoring commit because change is already in target: {0}" .format(commit_hash)) else: if ignore_by_commit_message: logging.debug("Ignoring commit {0} by commit message.".format(commit_hash)) else: logging.debug("Ignoring commit {0} by config file.".format(commit_hash)) commits_ignored.append(commit_hash) else: logging.debug("NOT ignoring commit {0} since not in ignored commits ({1},{2})" .format(commit_hash, options.source_branch, options.target_branch)) if not change_in_target and not ignore_by_config and not ignore_by_commit_message: print u'{0} {1} ({2}) - {3}'.format(commit_hash, msg, date, author) cherry_pick_hashes.append(commit_hash) jira_keys += jira_key_pat.findall(msg) print '-' * 80 print "Jira keys referenced (Note: not all commit messages will reference a jira key):" print ','.join(jira_keys) print '-' * 80 logging.debug("Commits actually ignored (change was not in target): {0}" .format(pformat(commits_ignored))) if options.cherry_pick: cherrypick(cherry_pick_hashes, full_target_branch_name) if __name__ == '__main__': main()