Commit 01d2cde9 authored by mgiuca@chromium.org's avatar mgiuca@chromium.org

git hyper-blame: Added approx. line number translation.

Previously, when a commit was skipped, it would be blamed on the line
number the line had *after* the skipped commit. This could mean a
totally unrelated commit gets blamed. Now, a heuristic analyses the diff
of the skipped commit to discover approximately what line number the
line had *before* the skipped commit, so it can hopefully be blamed on
the right commit.

BUG=574290

Review URL: https://codereview.chromium.org/1629253002

git-svn-id: svn://svn.chromium.org/chrome/trunk/tools/depot_tools@298609 0039d316-1c4b-4281-b951-d872f2087c98
parent 821e0a5d
......@@ -374,6 +374,10 @@ def del_config(option, scope='local'):
pass
def diff(oldrev, newrev, *args):
return run('diff', oldrev, newrev, *args)
def freeze():
took_action = False
......
......@@ -149,6 +149,110 @@ def get_parsed_blame(filename, revision='HEAD'):
return list(parse_blame(blame))
# Map from (oldrev, newrev) to hunk list (caching the results of git diff, but
# only the hunk line numbers, not the actual diff contents).
# hunk list contains (old, new) pairs, where old and new are (start, length)
# pairs. A hunk list can also be None (if the diff failed).
diff_hunks_cache = {}
def cache_diff_hunks(oldrev, newrev):
def parse_start_length(s):
# Chop the '-' or '+'.
s = s[1:]
# Length is optional (defaults to 1).
try:
start, length = s.split(',')
except ValueError:
start = s
length = 1
return int(start), int(length)
try:
return diff_hunks_cache[(oldrev, newrev)]
except KeyError:
pass
# Use -U0 to get the smallest possible hunks.
diff = git_common.diff(oldrev, newrev, '-U0')
# Get all the hunks.
hunks = []
for line in diff.split('\n'):
if not line.startswith('@@'):
continue
ranges = line.split(' ', 3)[1:3]
ranges = tuple(parse_start_length(r) for r in ranges)
hunks.append(ranges)
diff_hunks_cache[(oldrev, newrev)] = hunks
return hunks
def approx_lineno_across_revs(filename, newfilename, revision, newrevision,
lineno):
"""Computes the approximate movement of a line number between two revisions.
Consider line |lineno| in |filename| at |revision|. This function computes the
line number of that line in |newfilename| at |newrevision|. This is
necessarily approximate.
Args:
filename: The file (within the repo) at |revision|.
newfilename: The name of the same file at |newrevision|.
revision: A git revision.
newrevision: Another git revision. Note: Can be ahead or behind |revision|.
lineno: Line number within |filename| at |revision|.
Returns:
Line number within |newfilename| at |newrevision|.
"""
# This doesn't work that well if there are a lot of line changes within the
# hunk (demonstrated by GitHyperBlameLineMotionTest.testIntraHunkLineMotion).
# A fuzzy heuristic that takes the text of the new line and tries to find a
# deleted line within the hunk that mostly matches the new line could help.
# Use the <revision>:<filename> syntax to diff between two blobs. This is the
# only way to diff a file that has been renamed.
old = '%s:%s' % (revision, filename)
new = '%s:%s' % (newrevision, newfilename)
hunks = cache_diff_hunks(old, new)
cumulative_offset = 0
# Find the hunk containing lineno (if any).
for (oldstart, oldlength), (newstart, newlength) in hunks:
cumulative_offset += newlength - oldlength
if lineno >= oldstart + oldlength:
# Not there yet.
continue
if lineno < oldstart:
# Gone too far.
break
# lineno is in [oldstart, oldlength] at revision; [newstart, newlength] at
# newrevision.
# If newlength == 0, newstart will be the line before the deleted hunk.
# Since the line must have been deleted, just return that as the nearest
# line in the new file. Caution: newstart can be 0 in this case.
if newlength == 0:
return max(1, newstart)
newend = newstart + newlength - 1
# Move lineno based on the amount the entire hunk shifted.
lineno = lineno + newstart - oldstart
# Constrain the output within the range [newstart, newend].
return min(newend, max(newstart, lineno))
# Wasn't in a hunk. Figure out the line motion based on the difference in
# length between the hunks seen so far.
return lineno + cumulative_offset
def hyper_blame(ignored, filename, revision='HEAD', out=sys.stdout,
err=sys.stderr):
# Map from commit to parsed blame from that commit.
......@@ -189,23 +293,19 @@ def hyper_blame(ignored, filename, revision='HEAD', out=sys.stdout,
# ignore this commit.
break
# line.lineno_then is the line number in question at line.commit.
# TODO(mgiuca): This will be incorrect if line.commit added or removed
# lines. Translate that line number so that it refers to the position of
# the same line on previouscommit.
lineno_previous = line.lineno_then
# line.lineno_then is the line number in question at line.commit. We need
# to translate that line number so that it refers to the position of the
# same line on previouscommit.
lineno_previous = approx_lineno_across_revs(
line.commit.filename, previousfilename, line.commit.commithash,
previouscommit, line.lineno_then)
logging.debug('ignore commit %s on line p%d/t%d/n%d',
line.commit.commithash, lineno_previous, line.lineno_then,
line.lineno_now)
# Get the line at lineno_previous in the parent commit.
assert lineno_previous > 0
try:
newline = parent_blame[lineno_previous - 1]
except IndexError:
# lineno_previous is a guess, so it may be past the end of the file.
# Just grab the last line in the file.
newline = parent_blame[-1]
assert 1 <= lineno_previous <= len(parent_blame)
newline = parent_blame[lineno_previous - 1]
# Replace the commit and lineno_then, but not the lineno_now or context.
logging.debug(' replacing with %r', newline)
......
......@@ -820,19 +820,24 @@ commit) since the given person wrote it.</p></div>
</div>
</div>
<div class="sect1">
<h2 id="_caveats">CAVEATS</h2>
<div class="sectionbody">
<div class="paragraph"><p>When a line skips over an ignored commit, a guess is made as to which commit
previously modified that line, but it is not always clear where the line came
from. If the ignored commit makes lots of changes in close proximity, in
particular adding/removing/reordering lines, then the wrong authors may be
blamed for nearby edits.</p></div>
<div class="paragraph"><p>For this reason, <code>hyper-blame</code> works best when the ignored commits are be
limited to minor changes such as formatting and renaming, not refactoring or
other more invasive changes.</p></div>
</div>
</div>
<div class="sect1">
<h2 id="_bugs">BUGS</h2>
<div class="sectionbody">
<div class="ulist"><ul>
<li>
<p>
When a commit is ignored, hyper-blame currently just blames the same line in
the previous version of the file. This can be wildly inaccurate if the ignored
commit adds or removes lines, resulting in a completely wrong commit being
blamed.
</p>
</li>
<li>
<p>
There is currently no way to pass the ignore list as a file.
</p>
</li>
......@@ -864,7 +869,7 @@ from <a href="https://chromium.googlesource.com/chromium/tools/depot_tools.git">
<div id="footnotes"><hr /></div>
<div id="footer">
<div id="footer-text">
Last updated 2016-01-28 16:40:21 AEDT
Last updated 2016-02-05 13:43:52 AEDT
</div>
</div>
</body>
......
......@@ -2,12 +2,12 @@
.\" Title: git-hyper-blame
.\" Author: [FIXME: author] [see http://docbook.sf.net/el/author]
.\" Generator: DocBook XSL Stylesheets v1.78.1 <http://docbook.sf.net/>
.\" Date: 01/28/2016
.\" Date: 02/05/2016
.\" Manual: Chromium depot_tools Manual
.\" Source: depot_tools 7143379
.\" Source: depot_tools d2dbf32
.\" Language: English
.\"
.TH "GIT\-HYPER\-BLAME" "1" "01/28/2016" "depot_tools 7143379" "Chromium depot_tools Manual"
.TH "GIT\-HYPER\-BLAME" "1" "02/05/2016" "depot_tools d2dbf32" "Chromium depot_tools Manual"
.\" -----------------------------------------------------------------
.\" * Define some portability stuff
.\" -----------------------------------------------------------------
......@@ -93,18 +93,12 @@ c6eb3bfa (lorem 2014\-08\-11 23:15:57 +0000 5) NOSTRUD EXERCITATION ULLAMCO LAB
.sp
.sp
hyper\-blame places a * next to any line where it has skipped over an ignored commit, so you know that the line in question has been changed (by an ignored commit) since the given person wrote it\&.
.SH "BUGS"
.SH "CAVEATS"
.sp
.RS 4
.ie n \{\
\h'-04'\(bu\h'+03'\c
.\}
.el \{\
.sp -1
.IP \(bu 2.3
.\}
When a commit is ignored, hyper\-blame currently just blames the same line in the previous version of the file\&. This can be wildly inaccurate if the ignored commit adds or removes lines, resulting in a completely wrong commit being blamed\&.
.RE
When a line skips over an ignored commit, a guess is made as to which commit previously modified that line, but it is not always clear where the line came from\&. If the ignored commit makes lots of changes in close proximity, in particular adding/removing/reordering lines, then the wrong authors may be blamed for nearby edits\&.
.sp
For this reason, hyper\-blame works best when the ignored commits are be limited to minor changes such as formatting and renaming, not refactoring or other more invasive changes\&.
.SH "BUGS"
.sp
.RS 4
.ie n \{\
......
......@@ -51,13 +51,22 @@ demo:2[]
commit, so you know that the line in question has been changed (by an ignored
commit) since the given person wrote it.
CAVEATS
-------
When a line skips over an ignored commit, a guess is made as to which commit
previously modified that line, but it is not always clear where the line came
from. If the ignored commit makes lots of changes in close proximity, in
particular adding/removing/reordering lines, then the wrong authors may be
blamed for nearby edits.
For this reason, `hyper-blame` works best when the ignored commits are be
limited to minor changes such as formatting and renaming, not refactoring or
other more invasive changes.
BUGS
----
- When a commit is ignored, hyper-blame currently just blames the same line in
the previous version of the file. This can be wildly inaccurate if the ignored
commit adds or removes lines, resulting in a completely wrong commit being
blamed.
- There is currently no way to pass the ignore list as a file.
- It should be possible for a git repository to configure an automatic list of
commits to ignore (like `.gitignore`), so that project owners can maintain a
......
......@@ -254,6 +254,23 @@ class GitReadOnlyFunctionsTest(git_test_utils.GitRepoReadOnlyTestBase,
self.assertEqual(self.repo.run(set, self.gc.branches()),
{'master', 'branch_D', 'root_A'})
def testDiff(self):
# Get the names of the blobs being compared (to avoid hard-coding).
c_blob_short = self.repo.git('rev-parse', '--short',
'tag_C:some/files/file2').stdout.strip()
d_blob_short = self.repo.git('rev-parse', '--short',
'tag_D:some/files/file2').stdout.strip()
expected_output = [
'diff --git a/some/files/file2 b/some/files/file2',
'index %s..%s 100755' % (c_blob_short, d_blob_short),
'--- a/some/files/file2',
'+++ b/some/files/file2',
'@@ -1 +1,2 @@',
' file2 - vanilla',
'+file2 - merged']
self.assertEqual(expected_output,
self.repo.run(self.gc.diff, 'tag_C', 'tag_D').split('\n'))
def testDormant(self):
self.assertFalse(self.repo.run(self.gc.is_dormant, 'master'))
self.repo.git('config', 'branch.master.dormant', 'true')
......
......@@ -279,7 +279,7 @@ class GitHyperBlameSimpleTest(GitHyperBlameTestBase):
class GitHyperBlameLineMotionTest(GitHyperBlameTestBase):
REPO_SCHEMA = """
A B C D E
A B C D E F
"""
COMMIT_A = {
......@@ -293,44 +293,108 @@ class GitHyperBlameLineMotionTest(GitHyperBlameTestBase):
# Insert 2 lines at the top,
# Change "yellow" to "red".
# Insert 1 line at the bottom.
COMMIT_C = {
'file': {'data': 'X\nY\nA\nred\nblue\n'},
'file': {'data': 'X\nY\nA\nred\nblue\nZ\n'},
}
# Insert 2 more lines at the top.
COMMIT_D = {
'file': {'data': 'earth\nfire\nX\nY\nA\nred\nblue\n'},
'file': {'data': 'earth\nfire\nX\nY\nA\nred\nblue\nZ\n'},
}
# Insert a line before "red", and indent "red" and "blue".
COMMIT_E = {
'file': {'data': 'earth\nfire\nX\nY\nA\ncolors:\n red\n blue\n'},
'file': {'data': 'earth\nfire\nX\nY\nA\ncolors:\n red\n blue\nZ\n'},
}
# Insert a line between "A" and "colors".
COMMIT_F = {
'file': {'data': 'earth\nfire\nX\nY\nA\nB\ncolors:\n red\n blue\nZ\n'},
}
def testCacheDiffHunks(self):
"""Tests the cache_diff_hunks internal function."""
expected_hunks = [((0, 0), (1, 2)),
((2, 1), (4, 1)),
((3, 0), (6, 1)),
]
hunks = self.repo.run(self.git_hyper_blame.cache_diff_hunks, 'tag_B',
'tag_C')
self.assertEqual(expected_hunks, hunks)
def testApproxLinenoAcrossRevs(self):
"""Tests the approx_lineno_across_revs internal function."""
# Note: For all of these tests, the "old revision" and "new revision" are
# reversed, which matches the usage by hyper_blame.
# Test an unchanged line before any hunks in the diff. Should be unchanged.
lineno = self.repo.run(self.git_hyper_blame.approx_lineno_across_revs,
'file', 'file', 'tag_B', 'tag_A', 1)
self.assertEqual(1, lineno)
# Test an unchanged line after all hunks in the diff. Should be matched to
# the line's previous position in the file.
lineno = self.repo.run(self.git_hyper_blame.approx_lineno_across_revs,
'file', 'file', 'tag_D', 'tag_C', 6)
self.assertEqual(4, lineno)
# Test a line added in a new hunk. Should be matched to the line *before*
# where the hunk was inserted in the old version of the file.
lineno = self.repo.run(self.git_hyper_blame.approx_lineno_across_revs,
'file', 'file', 'tag_F', 'tag_E', 6)
self.assertEqual(5, lineno)
# Test lines added in a new hunk at the very start of the file. This tests
# an edge case: normally it would be matched to the line *before* where the
# hunk was inserted (Line 0), but since the hunk is at the start of the
# file, we match to Line 1.
lineno = self.repo.run(self.git_hyper_blame.approx_lineno_across_revs,
'file', 'file', 'tag_C', 'tag_B', 1)
self.assertEqual(1, lineno)
lineno = self.repo.run(self.git_hyper_blame.approx_lineno_across_revs,
'file', 'file', 'tag_C', 'tag_B', 2)
self.assertEqual(1, lineno)
# Test an unchanged line in between hunks in the diff. Should be matched to
# the line's previous position in the file.
lineno = self.repo.run(self.git_hyper_blame.approx_lineno_across_revs,
'file', 'file', 'tag_C', 'tag_B', 3)
self.assertEqual(1, lineno)
# Test a changed line. Should be matched to the hunk's previous position in
# the file.
lineno = self.repo.run(self.git_hyper_blame.approx_lineno_across_revs,
'file', 'file', 'tag_C', 'tag_B', 4)
self.assertEqual(2, lineno)
# Test a line added in a new hunk at the very end of the file. Should be
# matched to the line *before* where the hunk was inserted (the last line of
# the file). Technically same as the case above but good to boundary test.
lineno = self.repo.run(self.git_hyper_blame.approx_lineno_across_revs,
'file', 'file', 'tag_C', 'tag_B', 6)
self.assertEqual(3, lineno)
def testInterHunkLineMotion(self):
"""Tests a blame with line motion in another hunk in the ignored commit."""
# This test was mostly written as a demonstration of the limitations of the
# current algorithm (it exhibits non-ideal behaviour).
# Blame from D, ignoring C.
# Lines 1, 2 were added by D.
# Lines 3, 4 were added by C (but ignored, so blame A, B, respectively).
# TODO(mgiuca): Ideally, this would blame both of these lines on A, because
# they add lines nowhere near the changes made by B.
# Lines 3, 4 were added by C (but ignored, so blame A).
# Line 5 was added by A.
# Line 6 was modified by C (but ignored, so blame A).
# TODO(mgiuca): Ideally, Line 6 would be blamed on B, because that was the
# last commit to touch that line (changing "green" to "yellow"), but the
# algorithm isn't yet able to figure out that Line 6 in D == Line 4 in C ~=
# Line 2 in B.
# Line 6 was modified by C (but ignored, so blame B). (Note: This requires
# the algorithm to figure out that Line 6 in D == Line 4 in C ~= Line 2 in
# B, so it blames B. Otherwise, it would blame A.)
# Line 7 was added by A.
# Line 8 was added by C (but ignored, so blame A).
expected_output = [self.blame_line('D', ' 1) earth'),
self.blame_line('D', ' 2) fire'),
self.blame_line('A', '3*) X'),
self.blame_line('B', '4*) Y'),
self.blame_line('A', '4*) Y'),
self.blame_line('A', ' 5) A'),
self.blame_line('A', '6*) red'),
self.blame_line('B', '6*) red'),
self.blame_line('A', ' 7) blue'),
self.blame_line('A', '8*) Z'),
]
retval, output = self.run_hyperblame(['C'], 'file', 'tag_D')
self.assertEqual(0, retval)
......@@ -356,6 +420,7 @@ class GitHyperBlameLineMotionTest(GitHyperBlameTestBase):
self.blame_line('C', '6*) colors:'),
self.blame_line('A', '7*) red'),
self.blame_line('A', '8*) blue'),
self.blame_line('C', ' 9) Z'),
]
retval, output = self.run_hyperblame(['E'], 'file', 'tag_E')
self.assertEqual(0, retval)
......
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment