# git_hyper_blame.py

#!/usr/bin/env python
# Copyright 2016 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.

"""Wrapper around git blame that ignores certain commits.
"""

from __future__ import print_function

import argparse
import collections
import logging
import os
import subprocess2
import sys

import git_common
import git_dates
import setup_color


# Set the root logger to INFO. The logging.debug() tracing that hyper_blame
# emits while re-attributing lines is therefore silent unless the level is
# lowered.
logging.getLogger().setLevel(logging.INFO)


# File that main() looks for in the repository root; when it exists (and
# --no-default-ignores is not given) the revisions it lists are ignored.
DEFAULT_IGNORE_FILE_NAME = '.git-blame-ignore-revs'


class Commit(object):
  """Record of one commit as described by the headers of git blame -p.

  Only the hash is known at construction time. Every other attribute starts
  out as None and is filled in by parse_blame, which copies each header key
  onto the attribute of the same name (with '-' replaced by '_').
  """
  def __init__(self, commithash):
    self.commithash = commithash
    # Author identity and timestamp.
    self.author = self.author_mail = None
    self.author_time = self.author_tz = None
    # Committer identity and timestamp.
    self.committer = self.committer_mail = None
    self.committer_time = self.committer_tz = None
    # Remaining header fields.
    self.summary = self.boundary = None
    self.previous = self.filename = None

  def __repr__(self):  # pragma: no cover
    return '<Commit %s>' % self.commithash


# One line of blame output.
#   commit: the Commit the line is attributed to.
#   context: the text of the line.
#   lineno_then: the line's number in the file as of |commit|.
#   lineno_now: the line's number in the file at the blamed revision.
#   modified: True once hyper_blame has re-attributed the line past an ignored
#             commit; parse_blame always produces False.
BlameLine = collections.namedtuple(
    'BlameLine',
    ['commit', 'context', 'lineno_then', 'lineno_now', 'modified'])


def parse_blame(blameoutput):
  """Parses the output of git blame -p into a sequence of BlameLine tuples.

  Each entry in the output is a "<hash> <lineno_then> <lineno_now> ..." line,
  followed by optional "key value" header lines, followed by the text of the
  line prefixed with a tab. One Commit object is shared by all entries naming
  the same hash, so header values seen once are visible from every BlameLine
  for that commit.

  Args:
    blameoutput: The complete text printed by git blame -p.

  Yields:
    One BlameLine per entry, with modified set to False.
  """
  commits = {}
  remaining = iter(blameoutput.split('\n'))

  for entry_line in remaining:
    if not entry_line.strip():
      continue
    fields = entry_line.split()
    commithash = fields[0]
    lineno_then = int(fields[1])
    lineno_now = int(fields[2])

    commit = commits.get(commithash)
    if commit is None:
      commit = commits[commithash] = Commit(commithash)

    # Consume header lines up to and including the tab-prefixed text line.
    # |last| ends up as that text line, or as the final line read if the
    # output stops before one is found.
    last = entry_line
    for last in remaining:
      if last.startswith('\t'):
        break
      key, space, value = last.partition(' ')
      if not space:
        # A bare key with no value is recorded as a flag.
        value = True
      setattr(commit, key.replace('-', '_'), value)

    # Drop the leading tab from the text line.
    yield BlameLine(commit, last[1:], lineno_then, lineno_now, False)


def print_table(table, colsep=' ', rowsep='\n', align=None, out=sys.stdout):
  """Writes a 2D rectangular array of strings to |out| in aligned columns.

  Args:
    table: Sequence of rows, each a sequence of strings of the same length.
    colsep: String placed between adjacent cells of a row.
    rowsep: String written after each row.
    align: Optional string of 'l' and 'r', designating whether each column is
           left- or right-aligned. Defaults to left aligned.
    out: File-like object the rows are printed to.
  """
  if not len(table):
    return

  # Every column is as wide as its widest cell.
  widths = None
  for row in table:
    if widths is None:
      widths = [len(text) for text in row]
    else:
      widths = [max(widths[col], len(text)) for col, text in enumerate(row)]

  if align is None:  # pragma: no cover
    align = 'l' * len(widths)

  for row in table:
    final_col = len(row) - 1
    cells = []
    for col, text in enumerate(row):
      fill = ' ' * (widths[col] - len(text))
      if align[col] == 'r':
        cells.append(fill + text)
      elif col < final_col:
        cells.append(text + fill)
      else:
        # A left-aligned final column is not padded, which avoids trailing
        # whitespace on every row.
        cells.append(text)
    try:
      print(*cells, sep=colsep, end=rowsep, file=out)
    except IOError:  # pragma: no cover
      # Can happen on Windows if the pipe is closed early.
      pass


def pretty_print(parsedblame, show_filenames=False, out=sys.stdout):
  """Formats parsed blame lines as an aligned table and prints it to |out|.

  Each row holds the abbreviated commit hash, optionally the commit's filename,
  the author, the author date, the current line number and the line's text.

  Args:
    parsedblame: Iterable of BlameLine tuples.
    show_filenames: If True, adds a filename column after the commit hash.
    out: File-like object the table is printed to.
  """
  # Only the line-number column is right-aligned; the filename column, when
  # present, shifts it one position to the right.
  if show_filenames:
    align = 'llllrl'
  else:
    align = 'lllrl'

  table = []
  for entry in parsedblame:
    commit = entry.commit
    when = git_dates.timestamp_offset_to_datetime(
        commit.author_time, commit.author_tz)
    # A '*' after the line number flags an entry whose modified field is set.
    marker = '*' if entry.modified else ''

    row = [commit.commithash[:8]]
    if show_filenames:
      row.append(commit.filename)
    row += [
        '(' + commit.author,
        git_dates.datetime_string(when),
        str(entry.lineno_now) + marker + ')',
        entry.context,
    ]
    table.append(row)

  print_table(table, align=align, out=out)


def get_parsed_blame(filename, revision='HEAD'):
  """Runs porcelain git blame on |filename| at |revision| and parses it.

  Args:
    filename: The file to blame.
    revision: The git revision to blame at.

  Returns:
    A list of the BlameLine tuples yielded by parse_blame.
  """
  return list(parse_blame(
      git_common.blame(filename, revision=revision, porcelain=True)))


# Map from (oldrev, newrev) to hunk list (caching the results of git diff, but
# only the hunk line numbers, not the actual diff contents).
# hunk list contains (old, new) pairs, where old and new are (start, length)
# pairs. Entries are only added after a diff has been obtained and parsed, so
# every cached value is a list.
diff_hunks_cache = {}


def cache_diff_hunks(oldrev, newrev):
  """Returns the hunk ranges of the diff from |oldrev| to |newrev|, memoized.

  Args:
    oldrev: The old side of the diff, as accepted by git_common.diff.
    newrev: The new side of the diff, as accepted by git_common.diff.

  Returns:
    A list of ((oldstart, oldlength), (newstart, newlength)) tuples, one per
    "@@" hunk header, in the order they appear in the diff. The same list
    object is returned on subsequent calls with the same arguments.
  """
  def parse_start_length(s):
    # Chop the '-' or '+'.
    s = s[1:]
    # Length is optional (defaults to 1).
    try:
      start, length = s.split(',')
    except ValueError:
      start = s
      length = 1
    return int(start), int(length)

  try:
    return diff_hunks_cache[(oldrev, newrev)]
  except KeyError:
    pass

  # Use -U0 to get the smallest possible hunks.
  diff = git_common.diff(oldrev, newrev, '-U0')

  # Get all the hunks. A header looks like "@@ -<old> +<new> @@ ...", so the
  # second and third space-separated fields are the two ranges.
  hunks = []
  for line in diff.split('\n'):
    if not line.startswith('@@'):
      continue
    ranges = line.split(' ', 3)[1:3]
    ranges = tuple(parse_start_length(r) for r in ranges)
    hunks.append(ranges)

  diff_hunks_cache[(oldrev, newrev)] = hunks
  return hunks


def approx_lineno_across_revs(filename, newfilename, revision, newrevision,
                              lineno):
  """Computes the approximate movement of a line number between two revisions.

  Consider line |lineno| in |filename| at |revision|. This function computes the
  line number of that line in |newfilename| at |newrevision|. This is
  necessarily approximate.

  Args:
    filename: The file (within the repo) at |revision|.
    newfilename: The name of the same file at |newrevision|.
    revision: A git revision.
    newrevision: Another git revision. Note: Can be ahead or behind |revision|.
    lineno: Line number within |filename| at |revision|.

  Returns:
    Line number within |newfilename| at |newrevision|.
  """
  # This doesn't work that well if there are a lot of line changes within the
  # hunk (demonstrated by GitHyperBlameLineMotionTest.testIntraHunkLineMotion).
  # A fuzzy heuristic that takes the text of the new line and tries to find a
  # deleted line within the hunk that mostly matches the new line could help.

  # Use the <revision>:<filename> syntax to diff between two blobs. This is the
  # only way to diff a file that has been renamed.
  old = '%s:%s' % (revision, filename)
  new = '%s:%s' % (newrevision, newfilename)
  hunks = cache_diff_hunks(old, new)

  # Net number of lines added by the hunks that lie entirely before lineno.
  cumulative_offset = 0

  # Find the hunk containing lineno (if any).
  for (oldstart, oldlength), (newstart, newlength) in hunks:
    # First old line that lies after this hunk. A hunk with oldlength == 0 is a
    # pure insertion *after* line oldstart, so oldstart itself is untouched and
    # only the lines following it are shifted.
    first_line_after = oldstart + max(oldlength, 1)

    if lineno >= first_line_after:
      # Not there yet. The whole hunk precedes lineno, so its size change
      # shifts the line.
      cumulative_offset += newlength - oldlength
      continue

    if oldlength == 0 or lineno < oldstart:
      # Gone too far. This hunk (and every later one) starts after lineno and
      # therefore must not contribute to the offset.
      break

    # lineno is in [oldstart, oldstart + oldlength) at revision; the hunk
    # occupies [newstart, newstart + newlength) at newrevision.

    # If newlength == 0, newstart will be the line before the deleted hunk.
    # Since the line must have been deleted, just return that as the nearest
    # line in the new file. Caution: newstart can be 0 in this case.
    if newlength == 0:
      return max(1, newstart)

    newend = newstart + newlength - 1

    # Move lineno based on the amount the entire hunk shifted.
    lineno = lineno + newstart - oldstart
    # Constrain the output within the range [newstart, newend].
    return min(newend, max(newstart, lineno))

  # Wasn't in a hunk. Figure out the line motion based on the difference in
  # length between the hunks that precede the line.
  return lineno + cumulative_offset


def hyper_blame(ignored, filename, revision='HEAD', out=sys.stdout,
                err=sys.stderr):
  """Blames |filename| at |revision|, looking past the commits in |ignored|.

  Any line attributed to an ignored commit is re-attributed by blaming the
  parent of that commit, repeatedly, until a non-ignored commit is found (or
  the line cannot be traced back any further). The result is pretty-printed
  to |out|.

  Args:
    ignored: Container of commit hashes, compared against the hashes reported
             by git blame.
    filename: The file to blame.
    revision: The git revision to blame at.
    out: File-like object the blame table is printed to.
    err: File-like object that receives git's error output if the initial
         blame fails.

  Returns:
    0 on success, or the return code of the failed git command.
  """
  # Map from (filename, commit) to parsed blame of that file at that commit.
  # The filename is part of the key because the file may have been renamed.
  blame_from = {}

  def cache_blame_from(filename, commithash):
    key = (filename, commithash)
    try:
      return blame_from[key]
    except KeyError:
      parsed = get_parsed_blame(filename, commithash)
      blame_from[key] = parsed
      return parsed

  def comparable(path):
    # Callers may pass a filename in the platform's native form, while git
    # reports paths in its own form; normalize both before comparing.
    return os.path.normcase(os.path.normpath(path))

  try:
    parsed = cache_blame_from(filename, git_common.hash_one(revision))
  except subprocess2.CalledProcessError as e:
    err.write(e.stderr)
    return e.returncode

  new_parsed = []

  # We don't show filenames in blame output unless we have to.
  show_filenames = False
  current_filename = comparable(filename)

  for line in parsed:
    # If a line references an ignored commit, blame that commit's parent
    # repeatedly until we find a non-ignored commit.
    while line.commit.commithash in ignored:
      if line.commit.previous is None:
        # You can't ignore the commit that added this file.
        break

      # The "previous" header has the form "<commit hash> <filename>".
      previouscommit, previousfilename = line.commit.previous.split(' ', 1)
      parent_blame = cache_blame_from(previousfilename, previouscommit)

      if len(parent_blame) == 0:
        # The previous version of this file was empty, therefore, you can't
        # ignore this commit.
        break

      # line.lineno_then is the line number in question at line.commit. We need
      # to translate that line number so that it refers to the position of the
      # same line on previouscommit.
      lineno_previous = approx_lineno_across_revs(
          line.commit.filename, previousfilename, line.commit.commithash,
          previouscommit, line.lineno_then)
      logging.debug('ignore commit %s on line p%d/t%d/n%d',
                    line.commit.commithash, lineno_previous, line.lineno_then,
                    line.lineno_now)

      # Get the line at lineno_previous in the parent commit.
      assert 1 <= lineno_previous <= len(parent_blame)
      newline = parent_blame[lineno_previous - 1]

      # Replace the commit and lineno_then, but not the lineno_now or context.
      logging.debug('    replacing with %r', newline)
      line = BlameLine(newline.commit, line.context, lineno_previous,
                       line.lineno_now, True)

    # If any line has a different filename to the file's current name, turn on
    # filename display for the entire blame output.
    commit_filename = line.commit.filename
    if (commit_filename is None or
        comparable(commit_filename) != current_filename):
      show_filenames = True

    new_parsed.append(line)

  pretty_print(new_parsed, show_filenames=show_filenames, out=out)

  return 0


def parse_ignore_file(ignore_file):
  """Yields the revisions listed in an ignore file.

  Everything from the first '#' on a line is treated as a comment. Surrounding
  whitespace is stripped, and lines that end up empty are skipped.

  Args:
    ignore_file: Iterable of text lines, such as an open file.

  Yields:
    Each non-empty revision string, in file order.
  """
  for raw_line in ignore_file:
    revision, _, _ = raw_line.partition('#')
    revision = revision.strip()
    if not revision:
      continue
    yield revision


def main(args, stdout=sys.stdout, stderr=sys.stderr):
  """Command-line entry point for git hyper-blame.

  Collects the revisions to ignore from -i, from --ignore-file and (unless
  --no-default-ignores is given) from DEFAULT_IGNORE_FILE_NAME in the
  repository root, then runs hyper_blame on the requested file.

  Args:
    args: Command-line arguments, not including the program name.
    stdout: File-like object the blame output is written to.
    stderr: File-like object warnings and git error output are written to.

  Returns:
    0 on success, or the return code of the failed git command.
  """
  parser = argparse.ArgumentParser(
      prog='git hyper-blame',
      description='git blame with support for ignoring certain commits.')
  parser.add_argument('-i', metavar='REVISION', action='append', dest='ignored',
                      default=[], help='a revision to ignore')
  parser.add_argument('--ignore-file', metavar='FILE',
                      type=argparse.FileType('r'), dest='ignore_file',
                      help='a file containing a list of revisions to ignore')
  # This is a flag: without action='store_true' argparse would require it to be
  # followed by a value and would consume the next command-line argument.
  parser.add_argument('--no-default-ignores', dest='no_default_ignores',
                      action='store_true',
                      help='Do not ignore commits from .git-blame-ignore-revs.')
  parser.add_argument('revision', nargs='?', default='HEAD', metavar='REVISION',
                      help='revision to look at')
  parser.add_argument('filename', metavar='FILE', help='filename to blame')

  args = parser.parse_args(args)
  try:
    repo_root = git_common.repo_root()
  except subprocess2.CalledProcessError as e:
    stderr.write(e.stderr)
    return e.returncode

  # Make filename relative to the repository root, and cd to the root dir (so
  # all filenames throughout this script are relative to the root).
  filename = os.path.relpath(args.filename, repo_root)
  os.chdir(repo_root)

  # Normalize filename so we can compare it to other filenames git gives us.
  filename = os.path.normpath(filename)
  filename = os.path.normcase(filename)

  # Gather revisions in order: -i arguments, then the default ignore file, then
  # the file given with --ignore-file.
  ignored_list = list(args.ignored)
  if not args.no_default_ignores and os.path.exists(DEFAULT_IGNORE_FILE_NAME):
    with open(DEFAULT_IGNORE_FILE_NAME) as ignore_file:
      ignored_list.extend(parse_ignore_file(ignore_file))

  if args.ignore_file:
    ignored_list.extend(parse_ignore_file(args.ignore_file))

  # Resolve every revision to a commit hash; unknown revisions only produce a
  # warning and are otherwise skipped.
  ignored = set()
  for c in ignored_list:
    try:
      ignored.add(git_common.hash_one(c))
    except subprocess2.CalledProcessError:
      # Custom warning string (the message from git-rev-parse is inappropriate).
      stderr.write('warning: unknown revision \'%s\'.\n' % c)

  return hyper_blame(ignored, filename, args.revision, out=stdout, err=stderr)


# Script entry point: run main() on the command-line arguments and exit with
# its return code.
if __name__ == '__main__':  # pragma: no cover
  # NOTE(review): setup_color.init() is a project helper; presumably it
  # prepares coloured terminal output — confirm against setup_color.
  setup_color.init()
  # main() writes its output to the stream yielded by git_common.less()
  # (presumably the input of a pager process — confirm against git_common).
  with git_common.less() as less_input:
    sys.exit(main(sys.argv[1:], stdout=less_input))