#!/usr/bin/env python
# Copyright 2016 the V8 project authors. All rights reserved.
# Use of this source code is governed by a BSD-style license that can be
# found in the LICENSE file.
'''
Usage: callstats.py [-h] <command> ...

Optional arguments:
  -h, --help  show this help message and exit

Commands:
  run         run chrome with --runtime-call-stats and generate logs
  stats       process logs and print statistics
  json        process logs from several versions and generate JSON
  help        help information

For help on a specific command, run ./callstats.py help <command>.
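
Example invocations (the URLs, log file names and directories below are only
illustrative):
  ./callstats.py run -n 5 -t 60 https://www.google.com
  ./callstats.py stats --sort desc --limit 20 www.google.com#1.txt
  ./callstats.py json logs/before logs/after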
'''

import argparse
import json
import os
import re
import shutil
import subprocess
import sys
import tempfile

import numpy
import scipy
import scipy.stats
from math import sqrt


# Run benchmarks.

def print_command(cmd_args):
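  """Print cmd_args as a single, copy-pastable shell command, quoting
  --flag=value arguments whose value contains spaces or starts with '-',
  as well as any other argument containing spaces."""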
  def fix_for_printing(arg):
    m = re.match(r'^--([^=]+)=(.*)$', arg)
    if m and (' ' in m.group(2) or m.group(2).startswith('-')):
      arg = "--{}='{}'".format(m.group(1), m.group(2))
    elif ' ' in arg:
      arg = "'{}'".format(arg)
    return arg
  print " ".join(map(fix_for_printing, cmd_args))


def start_replay_server(args, sites):
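  """Write the stats-injection script to a temporary file and start the
  web-page-replay proxy on ports 4080 (HTTP) and 4443 (HTTPS), replaying the
  archive given by --replay-wpr. Returns a dict holding the server process
  and the path of the injection script."""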
  with tempfile.NamedTemporaryFile(prefix='callstats-inject-', suffix='.js',
                                   mode='wt', delete=False) as f:
    injection = f.name
    generate_injection(f, sites, args.refresh)
  cmd_args = [
      args.replay_bin,
      "--port=4080",
      "--ssl_port=4443",
      "--no-dns_forwarding",
      "--use_closest_match",
      "--no-diff_unknown_requests",
      "--inject_scripts=deterministic.js,{}".format(injection),
      args.replay_wpr,
  ]
  print "=" * 80
  print_command(cmd_args)
  with open(os.devnull, 'w') as null:
    server = subprocess.Popen(cmd_args, stdout=null, stderr=null)
  print "RUNNING REPLAY SERVER: %s with PID=%s" % (args.replay_bin, server.pid)
  print "=" * 80
  return {'process': server, 'injection': injection}


def stop_replay_server(server):
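  """Terminate the web-page-replay process and remove the temporary
  injection script created by start_replay_server."""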
  print("SHUTTING DOWN REPLAY SERVER %s" % server['process'].pid)
  server['process'].terminate()
  os.remove(server['injection'])


def generate_injection(f, sites, refreshes=0):
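  """Write to f a JavaScript snippet that, on page load, matches the URL
  against the configured sites, schedules a timeout derived from the site's
  'timeline' or 'timeout' setting, then dumps stats via
  %GetAndResetRuntimeCallStats and reloads the page while the per-session
  refresh counter is still positive."""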
  print >> f, """\
(function() {
  let s = window.sessionStorage.getItem("refreshCounter");
  let refreshTotal = """, refreshes, """;
  let refreshCounter = s ? parseInt(s) : refreshTotal;
  let refreshId = refreshTotal - refreshCounter;
  if (refreshCounter > 0) {
    window.sessionStorage.setItem("refreshCounter", refreshCounter-1);
  }

  function match(url, item) {
    if ('regexp' in item) return url.match(item.regexp) !== null;
    let url_wanted = item.url;
    // Allow automatic redirections from http to https.
    if (url_wanted.startsWith("http://") && url.startsWith("https://")) {
      url_wanted = "https://" + url_wanted.substr(7);
    }
    return url.startsWith(url_wanted);
  };

  function onLoad(e) {
    let url = e.target.URL;
    for (let item of sites) {
      if (!match(url, item)) continue;
      let timeout = 'timeline' in item ? 2500 * item.timeline
                  : 'timeout'  in item ? 1000 * (item.timeout - 3)
                  : 10000;
      console.log("Setting time out of " + timeout + " for: " + url);
      window.setTimeout(function() {
        console.log("Time is out for: " + url);
        let msg = "STATS: (" + refreshId + ") " + url;
        %GetAndResetRuntimeCallStats(1, msg);
        if (refreshCounter > 0) {
          console.log("Refresh counter is " + refreshCounter + ", refreshing: " + url);
          window.location.reload();
        }
      }, timeout);
      return;
    }
    console.log("Ignoring: " + url);
  };

  let sites =
    """, json.dumps(sites), """;

  console.log("Event listenner added for: " + window.location.href);
  window.addEventListener("load", onLoad);
})();"""


def run_site(site, domain, args, timeout=None):
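  """Run Chrome (wrapped in the `timeout` command) on a single site with
  --runtime-call-stats enabled, writing the output to <domain>.txt or
  <domain>#<count>.txt. The run is repeated when Chrome crashes (exit status
  other than 0 or 124) or when the resulting stats file is empty."""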
  print "="*80
  print "RUNNING DOMAIN %s" % domain
  print "="*80
  result_template = "{domain}#{count}.txt" if args.repeat else "{domain}.txt"
  count = 0
  if timeout is None: timeout = args.timeout
  if args.replay_wpr:
    timeout *= 1 + args.refresh
    timeout += 1
  while count == 0 or (args.repeat is not None and count < args.repeat):
    count += 1
    result = result_template.format(domain=domain, count=count)
    retries = 0
    while args.retries is None or retries < args.retries:
      retries += 1
      try:
        if args.user_data_dir:
          user_data_dir = args.user_data_dir
        else:
          user_data_dir = tempfile.mkdtemp(prefix="chr_")
        js_flags = "--runtime-call-stats"
        if args.replay_wpr: js_flags += " --allow-natives-syntax"
        if args.js_flags: js_flags += " " + args.js_flags
        chrome_flags = [
            "--no-default-browser-check",
            "--disable-translate",
            "--js-flags={}".format(js_flags),
            "--no-first-run",
            "--user-data-dir={}".format(user_data_dir),
        ]
        if args.replay_wpr:
          chrome_flags += [
              "--host-resolver-rules=MAP *:80 localhost:4080, "  \
                                    "MAP *:443 localhost:4443, " \
                                    "EXCLUDE localhost",
              "--ignore-certificate-errors",
              "--disable-seccomp-sandbox",
              "--disable-web-security",
              "--reduce-security-for-testing",
              "--allow-insecure-localhost",
          ]
        else:
          chrome_flags += [
              "--single-process",
          ]
        if args.chrome_flags:
          chrome_flags += args.chrome_flags.split()
        cmd_args = [
            "timeout", str(timeout),
            args.with_chrome
        ] + chrome_flags + [ site ]
        print "- " * 40
        print_command(cmd_args)
        print "- " * 40
        with open(result, "wt") as f:
          status = subprocess.call(cmd_args, stdout=f)
        # 124 means timeout killed chrome, 0 means the user was bored first!
        # If neither of these happened, then chrome apparently crashed, so it
        # must be called again.
        if status != 124 and status != 0:
          print("CHROME CRASHED, REPEATING RUN");
          continue
        # If the stats file is empty, chrome must be called again.
        if os.path.isfile(result) and os.path.getsize(result) > 0:
          if args.print_url:
            with open(result, "at") as f:
              print >> f
              print >> f, "URL: {}".format(site)
          break
        if retries <= 6: timeout += 2 ** (retries-1)
        print("EMPTY RESULT, REPEATING RUN");
      finally:
        if not args.user_data_dir:
          shutil.rmtree(user_data_dir)


def read_sites_file(args):
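  """Read the benchmark sites from --sites-file: either a JSON list of
  objects with 'url' and 'timeout'/'timeline' keys, or a plain text file
  with one URL per line (empty lines and lines starting with '#' are
  ignored)."""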
  try:
    sites = []
    try:
      with open(args.sites_file, "rt") as f:
        for item in json.load(f):
          if 'timeout' not in item:
            # This is more-or-less arbitrary.
            item['timeout'] = int(2.5 * item['timeline'] + 3)
          if item['timeout'] > args.timeout: item['timeout'] = args.timeout
          sites.append(item)
    except ValueError:
      with open(args.sites_file, "rt") as f:
        for line in f:
          line = line.strip()
          if not line or line.startswith('#'): continue
          sites.append({'url': line, 'timeout': args.timeout})
    return sites
  except IOError as e:
    args.error("Cannot read from {}. {}.".format(args.sites_file, e.strerror))
    sys.exit(1)


def do_run(args):
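  """Entry point for the 'run' command: collect the sites, disambiguate
  duplicate domains, start the replay server if requested, and benchmark
  each site in turn."""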
  # Determine the websites to benchmark.
  if args.sites_file:
    sites = read_sites_file(args)
  else:
    sites = [{'url': site, 'timeout': args.timeout} for site in args.sites]
  # Disambiguate domains, if needed.
  L = []
  domains = {}
  for item in sites:
    site = item['url']
    m = re.match(r'^(https?://)?([^/]+)(/.*)?$', site)
    if not m:
      args.error("Invalid URL {}.".format(site))
      continue
    domain = m.group(2)
    entry = [site, domain, None, item['timeout']]
    if domain not in domains:
      domains[domain] = entry
    else:
      if not isinstance(domains[domain], int):
        domains[domain][2] = 1
        domains[domain] = 1
      domains[domain] += 1
      entry[2] = domains[domain]
    L.append(entry)
  replay_server = start_replay_server(args, sites) if args.replay_wpr else None
  try:
    # Run them.
    for site, domain, count, timeout in L:
      if count is not None: domain = "{}%{}".format(domain, count)
      print site, domain, timeout
      run_site(site, domain, args, timeout)
  finally:
    if replay_server:
      stop_replay_server(replay_server)


# Calculate statistics.

def statistics(data):
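  """Return sample count, average, median, standard deviation, min, max and
  a 95% confidence interval (mean +/- t * stddev / sqrt(N), using Student's
  t distribution with N-1 degrees of freedom) for the given data."""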
  N = len(data)
  average = numpy.average(data)
  median = numpy.median(data)
  low = numpy.min(data)
  high = numpy.max(data)
  if N > 1:
    # Compute the sample standard deviation by setting delta degrees of
    # freedom (ddof) to 1; the divisor used in the calculation is N - ddof.
    stddev = numpy.std(data, ddof=1)
    # Get the endpoints of the range that contains 95% of the distribution
    t_bounds = scipy.stats.t.interval(0.95, N-1)
    #assert abs(t_bounds[0] + t_bounds[1]) < 1e-6
    # Add the mean to the endpoints of the confidence interval.
    ci = {
        'abs': t_bounds[1] * stddev / sqrt(N),
        'low': average + t_bounds[0] * stddev / sqrt(N),
        'high': average + t_bounds[1] * stddev / sqrt(N)
    }
  else:
    stddev = 0
    ci = { 'abs': 0, 'low': average, 'high': average }
  if abs(stddev) > 0.0001 and abs(average) > 0.0001:
    ci['perc'] = t_bounds[1] * stddev / sqrt(N) / average * 100
  else:
    ci['perc'] = 0
  return { 'samples': N, 'average': average, 'median': median,
           'stddev': stddev, 'min': low, 'max': high, 'ci': ci }


def read_stats(path, S):
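  """Parse a single --runtime-call-stats log file, summing repeated entries
  per counter; each stats line is expected to look roughly like
  '<name> <time>ms <time%> <count> <count%>'. The per-file sums are appended
  to the time/count lists in S."""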
  with open(path, "rt") as f:
    # Process the whole file and sum repeating entries.
    D = { 'Sum': {'time': 0, 'count': 0} }
    for line in f:
      line = line.strip()
      # Discard headers and footers.
      if not line: continue
      if line.startswith("Runtime Function"): continue
      if line.startswith("===="): continue
      if line.startswith("----"): continue
      if line.startswith("URL:"): continue
      if line.startswith("STATS:"): continue
      # We have a regular line.
      fields = line.split()
      key = fields[0]
      time = float(fields[1].replace("ms", ""))
      count = int(fields[3])
      if key not in D: D[key] = { 'time': 0, 'count': 0 }
      D[key]['time'] += time
      D[key]['count'] += count
      # Accumulate the sum, unless this is the "Total" line.
      if key != "Total":
        D['Sum']['time'] += time
        D['Sum']['count'] += count
    # Append the sums as single entries to S.
    for key in D:
      if key not in S: S[key] = { 'time_list': [], 'count_list': [] }
      S[key]['time_list'].append(D[key]['time'])
      S[key]['count_list'].append(D[key]['count'])


def print_stats(S, args):
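  """Print one line of statistics per counter, sorted by average time, then
  average count, then name; optionally limited to the first/last --limit
  entries and followed by Partial (when --limit is used), Sum and Total rows
  unless --no-total is given."""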
  # Sort by ascending/descending time average, then by ascending/descending
  # count average, then by ascending name.
  def sort_asc_func(item):
    return (item[1]['time_stat']['average'],
            item[1]['count_stat']['average'],
            item[0])
  def sort_desc_func(item):
    return (-item[1]['time_stat']['average'],
            -item[1]['count_stat']['average'],
            item[0])
  # Sorting order comes from the command-line arguments.
  sort_func = sort_asc_func if args.sort == "asc" else sort_desc_func
  # Possibly limit how many elements to print.
  L = [item for item in sorted(S.items(), key=sort_func)
       if item[0] not in ["Total", "Sum"]]
  N = len(L)
  if args.limit == 0:
    low, high = 0, N
  elif args.sort == "desc":
    low, high = 0, args.limit
  else:
    low, high = N-args.limit, N
  # How to print entries.
  def print_entry(key, value):
    def stats(s, units=""):
      conf = "{:0.1f}({:0.2f}%)".format(s['ci']['abs'], s['ci']['perc'])
      return "{:8.1f}{} +/- {:15s}".format(s['average'], units, conf)
    print "{:>50s}  {}  {}".format(
      key,
      stats(value['time_stat'], units="ms"),
      stats(value['count_stat'])
    )
  # Print and calculate partial sums, if necessary.
  for i in range(low, high):
    print_entry(*L[i])
    if args.totals and args.limit != 0:
      if i == low:
        partial = { 'time_list': [0] * len(L[i][1]['time_list']),
                    'count_list': [0] * len(L[i][1]['count_list']) }
      assert len(partial['time_list']) == len(L[i][1]['time_list'])
      assert len(partial['count_list']) == len(L[i][1]['count_list'])
      for j, v in enumerate(L[i][1]['time_list']):
        partial['time_list'][j] += v
      for j, v in enumerate(L[i][1]['count_list']):
        partial['count_list'][j] += v
  # Print totals, if necessary.
  if args.totals:
    print '-' * 80
    if args.limit != 0:
      partial['time_stat'] = statistics(partial['time_list'])
      partial['count_stat'] = statistics(partial['count_list'])
      print_entry("Partial", partial)
    print_entry("Sum", S["Sum"])
    print_entry("Total", S["Total"])


def do_stats(args):
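  """Entry point for the 'stats' command: group the log files by domain,
  compute statistics for every counter, and print them per domain."""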
  T = {}
  for path in args.logfiles:
    filename = os.path.basename(path)
    m = re.match(r'^([^#]+)(#.*)?$', filename)
    domain = m.group(1)
    if domain not in T: T[domain] = {}
    read_stats(path, T[domain])
  for i, domain in enumerate(sorted(T)):
    if len(T) > 1:
      if i > 0: print
      print "{}:".format(domain)
      print '=' * 80
    S = T[domain]
    for key in S:
      S[key]['time_stat'] = statistics(S[key]['time_list'])
      S[key]['count_stat'] = statistics(S[key]['count_list'])
    print_stats(S, args)


# Generate JSON file.

def do_json(args):
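  """Entry point for the 'json' command: walk the given log directories,
  treating each directory's basename as a version name, aggregate statistics
  per version and domain, and print the result as compact JSON."""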
  J = {}
  for path in args.logdirs:
    if os.path.isdir(path):
      for root, dirs, files in os.walk(path):
        version = os.path.basename(root)
        if version not in J: J[version] = {}
        for filename in files:
          if filename.endswith(".txt"):
            m = re.match(r'^([^#]+)(#.*)?\.txt$', filename)
            domain = m.group(1)
            if domain not in J[version]: J[version][domain] = {}
            read_stats(os.path.join(root, filename), J[version][domain])
  for version, T in J.items():
    for domain, S in T.items():
      A = []
      for name, value in S.items():
        # We don't want the calculated sum in the JSON file.
        if name == "Sum": continue
        entry = [name]
        for x in ['time_list', 'count_list']:
          s = statistics(S[name][x])
          entry.append(round(s['average'], 1))
          entry.append(round(s['ci']['abs'], 1))
          entry.append(round(s['ci']['perc'], 2))
        A.append(entry)
      T[domain] = A
  print json.dumps(J, separators=(',', ':'))


# Help.

def do_help(parser, subparsers, args):
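  """Print the help text of the given command, or the top-level help when no
  command is given."""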
  if args.help_cmd:
    if args.help_cmd in subparsers:
      subparsers[args.help_cmd].print_help()
    else:
      args.error("Unknown command '{}'".format(args.help_cmd))
  else:
    parser.print_help()


# Main program, parse command line and execute.

def coexist(*l):
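  """Return True if all of the given values are set or none of them is."""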
  given = sum(1 for x in l if x)
  return given == 0 or given == len(l)

def main():
  parser = argparse.ArgumentParser()
  subparser_adder = parser.add_subparsers(title="commands", dest="command",
                                          metavar="<command>")
  subparsers = {}
  # Command: run.
  subparsers["run"] = subparser_adder.add_parser(
      "run", help="run --help")
  subparsers["run"].set_defaults(
      func=do_run, error=subparsers["run"].error)
  subparsers["run"].add_argument(
      "--chrome-flags", type=str, default="",
      help="specify additional chrome flags")
  subparsers["run"].add_argument(
      "--js-flags", type=str, default="",
      help="specify additional V8 flags")
  subparsers["run"].add_argument(
      "--no-url", dest="print_url", action="store_false", default=True,
      help="do not include url in statistics file")
  subparsers["run"].add_argument(
      "-n", "--repeat", type=int, metavar="<num>",
      help="specify iterations for each website (default: once)")
  subparsers["run"].add_argument(
      "-k", "--refresh", type=int, metavar="<num>", default=0,
      help="specify refreshes for each iteration (default: 0)")
  subparsers["run"].add_argument(
      "--replay-wpr", type=str, metavar="<path>",
      help="use the specified web page replay (.wpr) archive")
  subparsers["run"].add_argument(
      "--replay-bin", type=str, metavar="<path>",
      help="specify the replay.py script typically located in " \
           "$CHROMIUM/src/third_party/webpagereplay/replay.py")
  subparsers["run"].add_argument(
      "-r", "--retries", type=int, metavar="<num>",
      help="specify retries if website is down (default: forever)")
  subparsers["run"].add_argument(
      "-f", "--sites-file", type=str, metavar="<path>",
      help="specify file containing benchmark websites")
  subparsers["run"].add_argument(
      "-t", "--timeout", type=int, metavar="<seconds>", default=60,
      help="specify seconds before chrome is killed")
  subparsers["run"].add_argument(
      "-u", "--user-data-dir", type=str, metavar="<path>",
      help="specify user data dir (default is temporary)")
  subparsers["run"].add_argument(
      "-c", "--with-chrome", type=str, metavar="<path>",
      default="/usr/bin/google-chrome",
      help="specify chrome executable to use")
  subparsers["run"].add_argument(
      "sites", type=str, metavar="<URL>", nargs="*",
      help="specify benchmark website")
  # Command: stats.
  subparsers["stats"] = subparser_adder.add_parser(
      "stats", help="stats --help")
  subparsers["stats"].set_defaults(
      func=do_stats, error=subparsers["stats"].error)
  subparsers["stats"].add_argument(
      "-l", "--limit", type=int, metavar="<num>", default=0,
      help="limit how many items to print (default: none)")
  subparsers["stats"].add_argument(
      "-s", "--sort", choices=["asc", "desc"], default="asc",
      help="specify sorting order (default: ascending)")
  subparsers["stats"].add_argument(
      "-n", "--no-total", dest="totals", action="store_false", default=True,
      help="do not print totals")
  subparsers["stats"].add_argument(
      "logfiles", type=str, metavar="<logfile>", nargs="*",
      help="specify log files to parse")
  # Command: json.
  subparsers["json"] = subparser_adder.add_parser(
      "json", help="json --help")
  subparsers["json"].set_defaults(
      func=do_json, error=subparsers["json"].error)
  subparsers["json"].add_argument(
      "logdirs", type=str, metavar="<logdir>", nargs="*",
      help="specify directories with log files to parse")
  # Command: help.
  subparsers["help"] = subparser_adder.add_parser(
      "help", help="help information")
  subparsers["help"].set_defaults(
      func=lambda args: do_help(parser, subparsers, args),
      error=subparsers["help"].error)
  subparsers["help"].add_argument(
      "help_cmd", type=str, metavar="<command>", nargs="?",
      help="command for which to display help")
  # Execute the command.
  args = parser.parse_args()
  setattr(args, 'script_path', os.path.dirname(sys.argv[0]))
  if args.command == "run" and coexist(args.sites_file, args.sites):
    args.error("use either option --sites-file or site URLs")
    sys.exit(1)
  elif args.command == "run" and not coexist(args.replay_wpr, args.replay_bin):
    args.error("options --replay-wpr and --replay-bin must be used together")
    sys.exit(1)
  else:
    args.func(args)

if __name__ == "__main__":
  sys.exit(main())