# coding=utf8
# Copyright (c) 2012 The Chromium Authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
"""Utility functions to handle patches."""

7 8
import posixpath
import os
9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25
import re


class UnsupportedPatchFormat(Exception):
  """Raised when a patch for a file cannot be processed.

  Attributes:
    filename: name of the file whose patch was rejected.
    status: optional explanation; omitted from str() when falsy.
  """

  def __init__(self, filename, status):
    super(UnsupportedPatchFormat, self).__init__(filename, status)
    self.status = status
    self.filename = filename

  def __str__(self):
    message = 'Can\'t process patch for file %s.' % self.filename
    if not self.status:
      return message
    # The explanation, when present, goes on its own line.
    return message + ('\n%s' % self.status)


class FilePatchBase(object):
  """Defines a single file being modified.

  '/' is always used instead of os.sep for consistency.

  This class is not meant to be instantiated directly; __init__ asserts that
  it is being called on a subclass.
  """
  is_delete = False
  is_binary = False
  is_new = False

  def __init__(self, filename):
    """Stores the sanitized filename.

    Raises:
      UnsupportedPatchFormat: if the filename is rejected by
          _process_filename().
    """
    assert self.__class__ is not FilePatchBase
    self.filename = self._process_filename(filename)
    # Set when the file is copied or moved.
    self.source_filename = None

  @property
  def filename_utf8(self):
    """self.filename encoded as UTF-8."""
    return self.filename.encode('utf-8')

  @property
  def source_filename_utf8(self):
    """self.source_filename encoded as UTF-8, or None if it is not set."""
    if self.source_filename is not None:
      return self.source_filename.encode('utf-8')
    return None

  @staticmethod
  def _process_filename(filename):
    """Normalizes backslashes to '/' and rejects unsupported filenames.

    Returns:
      The filename with every backslash replaced by '/'.

    Raises:
      UnsupportedPatchFormat: if the filename contains a blacklisted
          character sequence, starts with '/', is 'CON' or starts with 'COM'
          followed by a digit.
    """
    filename = filename.replace('\\', '/')
    # Blacklist a few characters for simplicity.
    for i in ('$', '..', '\'', '"', '<', '>', ':', '|', '?', '*'):
      if i in filename:
        raise UnsupportedPatchFormat(
            filename, 'Can\'t use \'%s\' in filename.' % i)
    if filename.startswith('/'):
      raise UnsupportedPatchFormat(
          filename, 'Filename can\'t start with \'/\'.')
    if filename == 'CON':
      raise UnsupportedPatchFormat(
          filename, 'Filename can\'t be \'CON\'.')
    # Raw string: '\d' is not a valid escape in a plain string literal.
    if re.match(r'COM\d', filename):
      raise UnsupportedPatchFormat(
          filename, 'Filename can\'t be \'%s\'.' % filename)
    return filename

  def set_relpath(self, relpath):
    """Prefixes filename (and source_filename, if set) with relpath.

    Does nothing when relpath is empty.

    Raises:
      UnsupportedPatchFormat: if relpath starts with '/' or the resulting
          path is rejected by _process_filename().
    """
    if not relpath:
      return
    relpath = relpath.replace('\\', '/')
    if relpath.startswith('/'):
      self._fail('Relative path starts with %s' % relpath[0])
    self.filename = self._process_filename(
        posixpath.join(relpath, self.filename))
    if self.source_filename:
      self.source_filename = self._process_filename(
          posixpath.join(relpath, self.source_filename))

  def _fail(self, msg):
    """Shortcut function to raise UnsupportedPatchFormat."""
    raise UnsupportedPatchFormat(self.filename, msg)

  def __str__(self):
    # Use a status-like board: one column each for Binary, Delete, New and
    # Renamed/copied, a blank when the flag is not set.
    out = ''.join((
        'B' if self.is_binary else ' ',
        'D' if self.is_delete else ' ',
        'N' if self.is_new else ' ',
        'R' if self.source_filename else ' ',
    ))
    out += '  '
    if self.source_filename:
      out += '%s->' % self.source_filename_utf8
    return out + self.filename_utf8

  def dump(self):
    """Dumps itself in a verbose way to help diagnosing."""
    return str(self)

112 113 114 115 116 117

class FilePatchDelete(FilePatchBase):
  """Deletes a file."""
  is_delete = True

  def __init__(self, filename, is_binary):
    """Records the file to delete and whether it is binary.

    Raises:
      UnsupportedPatchFormat: if the base class rejects the filename.
    """
    super(FilePatchDelete, self).__init__(filename)
    # Shadows the class-level default with the caller-supplied value.
    self.is_binary = is_binary


class FilePatchBinary(FilePatchBase):
  """Content of a new binary file."""
  is_binary = True

  def __init__(self, filename, data, svn_properties, is_new):
    """Stores the binary content and its metadata.

    Args:
      filename: path of the file; sanitized by the base class.
      data: the file content; get() returns it unchanged.
      svn_properties: properties for the file; a falsy value is stored as
          an empty list.
      is_new: whether the file is being created.

    Raises:
      UnsupportedPatchFormat: if the base class rejects the filename.
    """
    super(FilePatchBinary, self).__init__(filename)
    self.data = data
    self.svn_properties = svn_properties or []
    self.is_new = is_new

  def get(self):
    """Returns the binary content."""
    return self.data

  def __str__(self):
    # Call the parent's __str__ explicitly: str(super(...)) would stringify
    # the super proxy object itself instead of delegating to the base class.
    base = super(FilePatchBinary, self).__str__()
    return base + ' %d bytes' % len(self.data)

138

139 140 141 142 143 144 145 146 147 148 149
class Hunk(object):
  """Parsed hunk data container.

  Attributes:
    start_src: start line on the source side.
    lines_src: number of lines on the source side.
    start_dst: start line on the destination side.
    lines_dst: number of lines on the destination side.
    variation: lines_dst - lines_src, computed once at construction.
    text: list that callers fill with the hunk's lines; starts empty.
  """

  def __init__(self, start_src, lines_src, start_dst, lines_dst):
    self.start_src = start_src
    self.lines_src = lines_src
    self.start_dst = start_dst
    self.lines_dst = lines_dst
    self.variation = self.lines_dst - self.lines_src
    self.text = []

  def __repr__(self):
    return '%s<(%d, %d) to (%d, %d)>' % (
        self.__class__.__name__,
        self.start_src, self.lines_src, self.start_dst, self.lines_dst)

155

156 157 158 159
class FilePatchDiff(FilePatchBase):
  """Patch for a single file.

  The diff text is split into a header and hunks. The header is checked
  against the git or svn layout, depending on whether a 'diff --git' line is
  present, and the hunks are parsed into Hunk objects.
  """

  def __init__(self, filename, diff, svn_properties):
    """Parses and validates the diff.

    Args:
      filename: path of the patched file; sanitized by the base class.
      diff: the diff text for this single file. Must not be empty.
      svn_properties: list of (name, value) properties; a falsy value is
          stored as an empty list. The list may be appended to while the
          git header is parsed.

    Raises:
      UnsupportedPatchFormat: if the diff is empty or fails validation.
    """
    super(FilePatchDiff, self).__init__(filename)
    if not diff:
      self._fail('File doesn\'t have a diff.')
    self.diff_header, self.diff_hunks = self._split_header(diff)
    self.svn_properties = svn_properties or []
    self.is_git_diff = self._is_git_diff_header(self.diff_header)
    # Number of leading path components stripped by mangle().
    self.patchlevel = 0
    if self.is_git_diff:
      self._verify_git_header()
    else:
      self._verify_svn_header()
    self.hunks = self._split_hunks()
    if self.source_filename and not self.is_new:
      self._fail('If source_filename is set, is_new must be also be set')

  def get(self, for_git):
    """Returns the diff text (header followed by hunks).

    When for_git is false and the file has a source filename, references to
    the source filename in the header are replaced by the filename.
    """
    if for_git or not self.source_filename:
      return self.diff_header + self.diff_hunks
    else:
      # patch is stupid. It patches the source_filename instead so get rid of
      # any source_filename reference if needed.
      return (
          self.diff_header.replace(
              self.source_filename_utf8, self.filename_utf8) +
          self.diff_hunks)

  def set_relpath(self, relpath):
    """Offsets the file names by relpath and rewrites the header to match."""
    old_filename = self.filename_utf8
    old_source_filename = self.source_filename_utf8 or self.filename_utf8
    super(FilePatchDiff, self).set_relpath(relpath)
    # Update the header too.
    filename = self.filename_utf8
    source_filename = self.source_filename_utf8 or self.filename_utf8
    lines = self.diff_header.splitlines(True)
    for i, line in enumerate(lines):
      if line.startswith('diff --git'):
        lines[i] = line.replace(
            'a/' + old_source_filename, source_filename).replace(
                'b/' + old_filename, filename)
      elif re.match(r'^\w+ from .+$', line) or line.startswith('---'):
        lines[i] = line.replace(old_source_filename, source_filename)
      elif re.match(r'^\w+ to .+$', line) or line.startswith('+++'):
        lines[i] = line.replace(old_filename, filename)
    self.diff_header = ''.join(lines)

  def _split_header(self, diff):
    """Splits a diff in two: the header and the hunks.

    The header runs up to and including the first '--- ' line plus the
    '+++ ' line that must follow it; everything after that is hunks.

    Returns:
      A (header, hunks) tuple of strings.

    Raises:
      UnsupportedPatchFormat: if '--- ' is not followed by '+++ ', or the
          first line after '+++ ' does not start with '@@ '.
    """
    lines = diff.splitlines(True)
    # Some diff may not have a ---/+++ set like a git rename with no change or
    # a svn diff with only property change; then every line is header.
    split = len(lines)
    for index, line in enumerate(lines):
      if line.startswith('--- '):
        split = index + 1
        break
    header = lines[:split]
    hunks = lines[split:]

    if hunks:
      if not hunks[0].startswith('+++ '):
        self._fail('Inconsistent header')
      header.append(hunks.pop(0))
      if hunks and not hunks[0].startswith('@@ '):
        self._fail('Inconsistent hunk header')

    # Mangle any \\ in the header to /.
    header_lines = ('Index:', 'diff', 'copy', 'rename', '+++', '---')
    basename = os.path.basename(self.filename_utf8)
    for i, line in enumerate(header):
      if line.split(' ', 1)[0] in header_lines or line.endswith(basename):
        header[i] = line.replace('\\', '/')
    return ''.join(header), ''.join(hunks)

  @staticmethod
  def _is_git_diff_header(diff_header):
    """Returns True if the diff for a single files was generated with git."""
    # Delete: http://codereview.chromium.org/download/issue6368055_22_29.diff
    # Rename partial change:
    # http://codereview.chromium.org/download/issue6250123_3013_6010.diff
    # Rename no change:
    # http://codereview.chromium.org/download/issue6287022_3001_4010.diff
    return any(l.startswith('diff --git') for l in diff_header.splitlines())

  def _parse_hunk_range(self, text):
    """Parses one side of a hunk header, e.g. '12,5' or '12'.

    Returns:
      A (start, length) tuple of ints; length is 1 when text has no comma.

    Raises:
      UnsupportedPatchFormat: if text has more than one comma or a piece is
          not a number (e.g. '1,' or ',').
    """
    parts = text.split(',')
    if len(parts) > 2:
      self._fail('Hunk header is malformed')
    try:
      start = int(parts[0])
      length = int(parts[1]) if len(parts) == 2 else 1
    except ValueError:
      # The header regexp accepts any mix of digits and commas, so an empty
      # piece can get here; report it like any other malformed header.
      self._fail('Hunk header is malformed')
    return start, length

  def _split_hunks(self):
    """Splits the hunks and does verification.

    Also sets is_new / is_delete when the single hunk's header says so.

    Returns:
      The list of Hunk objects, with start lines adjusted as done below.

    Raises:
      UnsupportedPatchFormat: if a hunk header is invalid, hunks are out of
          order or a header disagrees with the hunk content.
    """
    hunks = []
    for line in self.diff_hunks.splitlines(True):
      if line.startswith('@@'):
        match = re.match(r'^@@ -([\d,]+) \+([\d,]+) @@.*$', line)
        # File add will result in "-0,0 +1" but file deletion will result in
        # "-1,N +0,0" where N is the number of lines deleted. That's from diff
        # and svn diff. git diff doesn't exhibit this behavior.
        # svn diff for a single line file rewrite "@@ -1 +1 @@". Fun.
        # "@@ -1 +1,N @@" is also valid where N is the length of the new file.
        if not match:
          self._fail('Hunk header is unparsable')
        start_src, lines_src = self._parse_hunk_range(match.group(1))
        start_dst, lines_dst = self._parse_hunk_range(match.group(2))
        new_hunk = Hunk(start_src, lines_src, start_dst, lines_dst)
        if hunks:
          if new_hunk.start_src <= hunks[-1].start_src:
            self._fail('Hunks source lines are not ordered')
          if new_hunk.start_dst <= hunks[-1].start_dst:
            self._fail('Hunks destination lines are not ordered')
        hunks.append(new_hunk)
        continue
      hunks[-1].text.append(line)

    if len(hunks) == 1:
      if hunks[0].start_src == 0 and hunks[0].lines_src == 0:
        self.is_new = True
      if hunks[0].start_dst == 0 and hunks[0].lines_dst == 0:
        self.is_delete = True

    if self.is_new and self.is_delete:
      self._fail('Hunk header is all 0')

    if not self.is_new and not self.is_delete:
      for hunk in hunks:
        # Added lines minus removed lines must match what the header claims.
        variation = (
            sum(1 for i in hunk.text if i.startswith('+')) -
            sum(1 for i in hunk.text if i.startswith('-')))
        if variation != hunk.variation:
          self._fail(
              'Hunk header is incorrect: %d vs %d; %r' % (
                variation, hunk.variation, hunk))
        if not hunk.start_src:
          self._fail(
              'Hunk header start line is incorrect: %d' % hunk.start_src)
        if not hunk.start_dst:
          self._fail(
              'Hunk header start line is incorrect: %d' % hunk.start_dst)
        hunk.start_src -= 1
        hunk.start_dst -= 1
    if self.is_new and hunks:
      hunks[0].start_dst -= 1
    if self.is_delete and hunks:
      hunks[0].start_src -= 1
    return hunks

  def mangle(self, string):
    """Mangle a file path.

    Replaces backslashes by '/' and drops the first self.patchlevel path
    components.
    """
    return '/'.join(string.replace('\\', '/').split('/')[self.patchlevel:])

  def _verify_git_header(self):
    """Sanity checks the header.

    Expects the following format:

    <garbage>
    diff --git (|a/)<filename> (|b/)<filename>
    <similarity>
    <filemode changes>
    <index>
    <copy|rename from>
    <copy|rename to>
    --- <filename>
    +++ <filename>

    Everything is optional except the diff --git line.
    """
    lines = self.diff_header.splitlines()

    # Verify the diff --git line.
    old = None
    new = None
    while lines:
      match = re.match(r'^diff \-\-git (.*?) (.*)$', lines.pop(0))
      if not match:
        continue
      if match.group(1).startswith('a/') and match.group(2).startswith('b/'):
        self.patchlevel = 1
      old = self.mangle(match.group(1))
      new = self.mangle(match.group(2))

      # The rename is about the new file so the old file can be anything.
      if new not in (self.filename_utf8, 'dev/null'):
        self._fail('Unexpected git diff output name %s.' % new)
      if old == 'dev/null' and new == 'dev/null':
        self._fail('Unexpected /dev/null git diff.')
      break

    if not old or not new:
      self._fail('Unexpected git diff; couldn\'t find git header.')

    if old not in (self.filename_utf8, 'dev/null'):
      # Copy or rename.
      self.source_filename = old.decode('utf-8')
      self.is_new = True

    # Process what is left after the 'diff --git' line, one line at a time.
    last_line = ''
    while lines:
      line = lines.pop(0)
      self._verify_git_header_process_line(lines, line, last_line)
      last_line = line

    # Cheap check to make sure the file name is at least mentioned in the
    # 'diff' header. That the only remaining invariant.
    if self.filename_utf8 not in self.diff_header:
      self._fail('Diff seems corrupted.')

  def _verify_git_header_process_line(self, lines, line, last_line):
    """Processes a single line of the header.

    Args:
      lines: the header lines that follow |line|.
      line: the line to process.
      last_line: the line that preceded |line|, '' for the first one.

    Always returns None; problems are reported by raising
    UnsupportedPatchFormat. Lines matching no known pattern are ignored.

    Format is described to
    http://www.kernel.org/pub/software/scm/git/docs/git-diff.html
    """
    match = re.match(r'^(rename|copy) from (.+)$', line)
    old = self.source_filename_utf8 or self.filename_utf8
    if match:
      if old != match.group(2):
        self._fail('Unexpected git diff input name for line %s.' % line)
      if not lines or not lines[0].startswith('%s to ' % match.group(1)):
        self._fail(
            'Confused %s from/to git diff for line %s.' %
                (match.group(1), line))
      return

    match = re.match(r'^(rename|copy) to (.+)$', line)
    if match:
      if self.filename_utf8 != match.group(2):
        self._fail('Unexpected git diff output name for line %s.' % line)
      if not last_line.startswith('%s from ' % match.group(1)):
        self._fail(
            'Confused %s from/to git diff for line %s.' %
                (match.group(1), line))
      return

    match = re.match(r'^deleted file mode (\d{6})$', line)
    if match:
      # It is necessary to parse it because there may be no hunk, like when the
      # file was empty.
      self.is_delete = True
      return

    match = re.match(r'^new(| file) mode (\d{6})$', line)
    if match:
      mode = match.group(2)
      # Only look at owner ACL for executable.
      # NOTE(review): index 4 of a six-digit mode such as '100755' looks like
      # the group digit rather than the owner digit (index 3) - confirm. The
      # two agree for 100644 and 100755.
      if bool(int(mode[4]) & 1):
        self.svn_properties.append(('svn:executable', '.'))
      elif not self.source_filename and self.is_new:
        # It's a new file, not from a rename/copy, then there's no property to
        # delete.
        self.svn_properties.append(('svn:executable', None))
      return

    match = re.match(r'^--- (.*)$', line)
    if match:
      if last_line[:3] in ('---', '+++'):
        self._fail('--- and +++ are reversed')
      if match.group(1) == '/dev/null':
        self.is_new = True
      elif self.mangle(match.group(1)) != old:
        # git patches are always well formatted, do not allow random filenames.
        self._fail('Unexpected git diff: %s != %s.' % (old, match.group(1)))
      if not lines or not lines[0].startswith('+++'):
        self._fail('Missing git diff output name.')
      return

    match = re.match(r'^\+\+\+ (.*)$', line)
    if match:
      if not last_line.startswith('---'):
        self._fail('Unexpected git diff: --- not following +++.')
      if '/dev/null' == match.group(1):
        self.is_delete = True
      elif self.filename_utf8 != self.mangle(match.group(1)):
        self._fail(
            'Unexpected git diff: %s != %s.' % (self.filename, match.group(1)))
      if lines:
        self._fail('Crap after +++')
      # We're done.
      return

  def _verify_svn_header(self):
    """Sanity checks the header.

    A svn diff can contain only property changes, in that case there will be no
    proper header. To make things worse, this property change header is
    localized.
    """
    lines = self.diff_header.splitlines()
    last_line = ''

    while lines:
      line = lines.pop(0)
      self._verify_svn_header_process_line(lines, line, last_line)
      last_line = line

    # Cheap check to make sure the file name is at least mentioned in the
    # 'diff' header. That the only remaining invariant.
    if self.filename_utf8 not in self.diff_header:
      self._fail('Diff seems corrupted.')

  def _verify_svn_header_process_line(self, lines, line, last_line):
    """Processes a single line of the header.

    Args:
      lines: the header lines that follow |line|.
      line: the line to process.
      last_line: the line that preceded |line|, '' for the first one.

    Always returns None; problems are reported by raising
    UnsupportedPatchFormat. Only '---' and '+++' lines are inspected.
    """
    match = re.match(r'^--- ([^\t]+).*$', line)
    if match:
      if last_line[:3] in ('---', '+++'):
        self._fail('--- and +++ are reversed')
      if match.group(1) == '/dev/null':
        self.is_new = True
      elif self.mangle(match.group(1)) != self.filename_utf8:
        # guess the source filename.
        self.source_filename = match.group(1).decode('utf-8')
        self.is_new = True
      if not lines or not lines[0].startswith('+++'):
        self._fail('Nothing after header.')
      return

    match = re.match(r'^\+\+\+ ([^\t]+).*$', line)
    if match:
      if not last_line.startswith('---'):
        self._fail('Unexpected diff: --- not following +++.')
      if match.group(1) == '/dev/null':
        self.is_delete = True
      elif self.mangle(match.group(1)) != self.filename_utf8:
        self._fail('Unexpected diff: %s.' % match.group(1))
      if lines:
        self._fail('Crap after +++')
      # We're done.
      return

  def dump(self):
    """Dumps itself in a verbose way to help diagnosing."""
    return str(self) + '\n' + self.get(True)

510 511 512 513 514

class PatchSet(object):
  """A list of FilePatch* objects."""

  def __init__(self, patches):
    """Stores the patches sorted by order of application.

    Args:
      patches: iterable of FilePatchBase instances. It is materialized once,
          so a one-shot iterator is accepted.
    """
    # Take a copy first: the validation loop below would otherwise exhaust
    # an iterator and leave nothing to sort.
    patches = list(patches)
    for p in patches:
      assert isinstance(p, FilePatchBase)

    def key(p):
      """Sort by ordering of application.

      File move are first.
      Deletes are last.
      """
      # The bool is necessary because None < 'string' but the reverse is needed.
      return (
          p.is_delete,
          # False is before True, so files *with* a source file will be first.
          not bool(p.source_filename),
          p.source_filename_utf8,
          p.filename_utf8)

    self.patches = sorted(patches, key=key)

  def set_relpath(self, relpath):
    """Used to offset the patch into a subdirectory."""
    for patch in self.patches:
      patch.set_relpath(relpath)

  def __iter__(self):
    """Iterates over the patches in order of application."""
    return iter(self.patches)

  def __getitem__(self, key):
    """Returns the patch (or slice of patches) at position key."""
    return self.patches[key]

  @property
  def filenames(self):
    """List of the patches' filenames, in order of application."""
    return [p.filename for p in self.patches]