scripts/babysitter

   1 #!/usr/bin/env python
   2
   3 # Mimics GNU timeout, but does some fancy tracking based on custom features in mono nunit24.
   4
   5 import argparse
   6 import subprocess
   7 import re
   8 import signal
   9 import time
  10 import sys
  11 import os.path
  12 import copy
  13 import tempfile
  14 import calendar
  15 import json
  16
  17 ### Constants
  18
  19 # Here is how the communication with nunit works. It has to work with two constraints:
  20 # - We don't invoke nunit directly. We invoke some Makefile which invokes some other Makefile
  21 #   and at some point down the line someone calls nunit.
  22 # - nunit has to be able to report back to us even if (especially if) it terminates improperly.
  23 # To deal with all this, communication babysitter->nunit is done by environment variables,
  24 # and communication nunit->babysitter is done by leaving behind files in known locations.
  25
  26 # Filenames
  27
  28 CURRENT_TEST_FILE = "babysitter_report_current_test_file.txt"
  29 RAN_TEST_FILE     = "babysitter_report_ran_test_file.txt"
  30 FAILED_TEST_FILE  = "babysitter_report_failed_test_file.txt"
  31 LOGGING_FILE      = "babysitter_report.json_lines"
  32
  33 # Environment keys
  34
  35 # Keys used for Babysitter<->Nunit IPC
  36 CURRENT_TEST_KEY  = 'MONO_BABYSITTER_NUNIT_CURRENT_TEST_FILE' # Tell nunit where to leave files
  37 RAN_TEST_KEY      = 'MONO_BABYSITTER_NUNIT_RAN_TEST_FILE'
  38 FAILED_TEST_KEY   = 'MONO_BABYSITTER_NUNIT_FAILED_TEST_FILE'
  39 RUN_KEY           = 'MONO_BABYSITTER_NUNIT_RUN_TEST' # Semicolon-separated list of test names
  40 RUN_MODE_KEY      = 'MONO_BABYSITTER_NUNIT_RUN_MODE' # Equal to either RUN or AFTER
  41
  42 # Keys used for script configuration (see --help text)
  43 LOG_FILE_KEY      = 'MONO_BABYSITTER_LOG_FILE'       # Path
  44 RETRY_KEY         = 'MONO_BABYSITTER_RETRY'          # Equal to an integer
  45 VERBOSE_KEY       = 'MONO_BABYSITTER_VERBOSE'        # "Undocumented"-- used for debugging babysitter
  46
  47 # JSON keys
  48
  49 DATE_JSON       = 'date'          # POSIX timestamp of test suite run
  50 INVOKE_JSON     = 'invocation'
  51 COUNT_JSON      = 'iteration'        # How many times was command executed?
  52 LIMIT_JSON      = 'failure_max'
  53 SUPPORT_JSON    = 'babysitter_protocol' # Was the test suite running with a babysitter-aware nunit?
  54 FINAL_CODE_JSON = 'final_code'
  55 TESTS_JSON      = 'tests'         # Holds dictionary of (test case name)->(dict with TEST_ keys below)
  56 TEST_FAILURES         = 'normal_failures'
  57 TEST_CRASH_FAILURES   = 'crash_failures'
  58 TEST_TIMEOUT_FAILURES = 'timeout_failures'
  59
  60 ### Interpret arguments
  61
  62 scriptname = sys.argv[0]
  63
  64 # This is very silly: This program needs to switch from argparse to manual parsing
  65 # after the second positional argument, but argparse doesn't let you do this.
  66 # I work around this with a custom subclass which can selectively swallow errors:
  67 class Hesitate(Exception):
  68         pass
  69 class HesitantParser(argparse.ArgumentParser):
  70         def __init__(s, *args, **kwargs):
  71                 s.hesitating = True  # Errors will be swallowed until this is set false
  72                 argparse.ArgumentParser.__init__(s, *args, **kwargs)
  73         def error(s, *args, **kwargs):
  74                 if s.hesitating:
  75                         raise Hesitate() # Bail out before errors are printed.
  76                 argparse.ArgumentParser.error(s, *args, **kwargs)
  77
  78 # Define args
  79 argparser = HesitantParser(description="""\
  80 Run a test suite with a timeout.\n
  81 Durations are floating point numbers followed by an optional unit:\n
  82 's' for seconds (the default)
  83 'm' for minutes
  84 'h' for hours
  85 'd' for days\n
  86 supported environment variables:
  87   %s: File to write logs to (as line-delimited JSON)
  88   %s: If set to a number, failed test cases will be rerun this many times (NUnit test suites only)""" %
  89                 (LOG_FILE_KEY, RETRY_KEY),
  90         formatter_class=argparse.RawTextHelpFormatter)
  91 argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
  92         help="Send this signal to the command on timeout, instead of TERM.")
  93 argparser.add_argument('-k', '--kill-after-duration', dest='kill_after', metavar='duration',
  94         help="If process continues running after signal, send KILL after this duration.")
  95 argparser.add_argument('duration',
  96         help="Time to run before sending signal.")
  97 argparser.add_argument('command', nargs="+", help="Command+args to run.")
  98
  99 # Repeatedly parse the given args until we find the shortest prefix for which parsing succeeds.
 100 argc = len(sys.argv)
 101 extra_args = []
 102 for limit in range(1,argc+1):
 103         try:
 104                 if limit == argc: # On the final pass, parse for real
 105                         argparser.hesitating = False
 106                 args = argparser.parse_args(sys.argv[1:limit])
 107                 # If we're still here, parse_args succeeded.
 108                 # The final parsed arg is the command; remaining argv items are its args.
 109                 extra_args = sys.argv[limit:]
 110                 break
 111         except Hesitate: # Before the final pass, if parse_args fails, skip
 112                 pass
 113
 114 argparser.hesitating = False # Just act like a normal argparser from here
 115
 116 durationre = re.compile(r'(\d+)([smhd]?)')
 117 def parse_duration(duration): # Accept units
 118         match = durationre.match(duration)
 119         if not match:
 120                 argparser.error("Could not understand duration %s" % duration)
 121         time, units = match.group(1), match.group(2)
 122         time = int(time)
 123         if units == 'm':
 124                 time *= 60
 125         elif units == 'h':
 126                 time *= 60*60
 127         elif units == 'd':
 128                 time *= 60*60*24
 129         return time
 130
 131 def parse_signal(sig):        # Accept names
 132         if sig.isdigit():
 133                 return int(sig)
 134         for k,v in signal.__dict__.iteritems():
 135                 if k == ("SIG%s" % sig):
 136                         return v
 137         argparser.error("Could not understand signal name %s" % sig)
 138
 139 # Final interpretation of arguments
 140 duration = parse_duration(args.duration)
 141 kill_after = parse_duration(args.kill_after) if args.kill_after is not None else None
 142 timeout_signal = parse_signal(args.signal)
 143 command = args.command + extra_args
 144
 145 # Process environment
 146 global_env = copy.deepcopy( os.environ )
 147
 148 verbose = VERBOSE_KEY in global_env
 149 logging = LOG_FILE_KEY in global_env
 150 logfile = global_env[LOG_FILE_KEY] if logging else None
 151 crash_resuming = True # TODO: Consider exposing this option, or adding a retry_on_crash option.
 152 failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0
 153 babysitting = logging or failmax
 154 if babysitting:
 155         babysitter_dir = tempfile.mkdtemp()
 156         global_env[CURRENT_TEST_KEY] = os.path.join(babysitter_dir, CURRENT_TEST_FILE)
 157         global_env[RAN_TEST_KEY]     = os.path.join(babysitter_dir, RAN_TEST_FILE)
 158         global_env[FAILED_TEST_KEY]  = os.path.join(babysitter_dir, FAILED_TEST_FILE)
 159
 160 have_unix_process_groups = 'killpg' in os.__dict__
 161 have_windows_process_groups = 'CREATE_NEW_PROCESS_GROUP' in subprocess.__dict__
 162
 163 ### Timeout implementation
 164
 165 def wait(proc, duration):
 166         # TODO: If we detect Python 3.3, Popen objects have a wait(timeout) method we can use
 167         start = time.time()
 168         while True:
 169                 code = proc.poll()
 170                 if code is not None:
 171                         return code
 172                 if time.time()-start > duration:
 173                         return None
 174                 time.sleep(0.05)
 175
 176 # Popen and send_signal can't be called in their basic forms because we want to
 177 # send signals to all children, not just to the immediately spawned process.
 178 # Unfortunately the way to do this varies by operating system.
 179 def popen(*args, **kwargs):
 180         if have_unix_process_groups: # Call function on spawn to become process group leader
 181                 kwargs['preexec_fn'] = os.setsid
 182         elif have_windows_process_groups: # Set magic flag for Windows process groups
 183                 kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
 184         return subprocess.Popen(*args, **kwargs)
 185
 186 def send_signal(proc, sig):
 187         if have_unix_process_groups: # UNIX
 188                 # For compatibility with GNU timeout, pre-send the signal to just the monitored process
 189                 os.kill(proc.pid, sig)
 190                 # Send signal to entire group
 191                 os.killpg(proc.pid, sig)
 192                 # For compatibility with GNU Timeout, send a SIGCONT after the signal
 193                 # (so delivery has a chance to occur even for stopped processes)
 194                 if sig != signal.SIGKILL and sig != signal.SIGCONT:
 195                         os.kill(proc.pid, signal.SIGCONT)
 196         elif have_windows_process_groups: # Windows with Python 2.7 or better
 197                 os.kill(proc.pid, sig) # Becuase CREATE_NEW_PROCESS_GROUP, will go to entire group
 198         else: # Windows with Python 2.6-- CREATE_NEW_PROCESS_GROUP not supported
 199                 proc.send_signal(sig) # No way to contact group, just kill process
 200
 201 ### Utility functions
 202
 203 def attemptDelete(path):
 204         try:
 205                 os.remove(path)
 206         except OSError:
 207                 pass
 208
 209 def attemptLines(path):
 210         try:
 211                 with open(path) as f:
 212                         return map(lambda s: s.strip('\r\n'), f.readlines())
 213         except (OSError, IOError):
 214                 return []
 215
 216 def attemptFirstLine(path):
 217         lines = attemptLines(path)
 218         if lines:
 219                 return lines[0]
 220         return None
 221
 222 def posixtime(): # Amazingly, time.time() does not guarantee an epoch in the docs. However this does:
 223         return calendar.timegm(time.gmtime())
 224
 225 failcount = {}
 226 def failure_may_retry(test):
 227         if test not in failcount:
 228                 failcount[test] = 0
 229         failcount[test] += 1
 230         return failcount[test] < failmax
 231
 232 def verbose_print(arg):
 233         if (verbose):
 234                 print(arg)
 235
 236 def failure_annotate(test):
 237         return "%s (failure #%d of %d allowed)" % (test, failcount[test], failmax)
 238
 239 def pluralize(lst):
 240         return "s" if len(lst) > 1 else ""
 241
 242 ### Run command
 243
 244 def run(): # Returns exit code
 245         resume_after = []
 246         retry_these = []
 247         ever_completed = False
 248         died_politely = False
 249         proc = None
 250         code = None
 251
 252         # Set up logging
 253         if logging:
 254                 log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
 255                         INVOKE_JSON: " ".join(command)}
 256         else:
 257                 log = None
 258
 259         def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
 260                 if not logging:
 261                         return
 262                 if add is not None:
 263                         if key not in target:
 264                                 target[key] = 0
 265                         target[key] += add
 266                 else:
 267                         target[key] = set
 268
 269         def log_test(testname, key, set=None, add=None):   # Call to add test-case-level value to log
 270                 if not logging:
 271                         return
 272                 if TESTS_JSON not in log:
 273                         log[TESTS_JSON] = {}
 274                 if testname not in log[TESTS_JSON]:
 275                         log[TESTS_JSON][testname] = {}
 276                 log_value(key, set=set, add=add, target=log[TESTS_JSON][testname])
 277
 278         # Ready to run tests
 279         try:
 280                 while True:
 281                         env = copy.copy(global_env)
 282                         if ever_completed:
 283                                 retry_next = []
 284                         else: # Persist reported failures list until first non-crash run
 285                                 retry_next = retry_these
 286
 287                         log_value(COUNT_JSON, add=1)
 288
 289                         # Prepare environment/filesystem
 290                         if babysitting:
 291                                 for key in [CURRENT_TEST_KEY, RAN_TEST_KEY, FAILED_TEST_KEY]:
 292                                         attemptDelete(env[key])
 293                                 if resume_after:
 294                                         env[RUN_KEY] = ";".join(resume_after)
 295                                         env[RUN_MODE_KEY] = "EXCLUDE"
 296                                 elif retry_these:
 297                                         env[RUN_KEY] = ";".join(retry_these)
 298                                         env[RUN_MODE_KEY] = "RUN"
 299
 300                         # Run test suite
 301                         try:
 302                                 proc = popen(command, env=env)
 303                         except OSError:
 304                                 died_politely = True
 305                                 sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
 306                                 sys.exit(127)
 307
 308                         code = wait(proc, duration)
 309                         timed_out = code is None
 310                         if timed_out:                  # Initial timeout
 311                                 send_signal(proc, timeout_signal)
 312                                 if kill_after is not None: # Kill-after timeout
 313                                         code = wait(proc, kill_after)
 314                                         if code is None:
 315                                                 send_signal(proc, signal.SIGKILL)
 316                                 code = proc.wait()         # Waits forever
 317                                 sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
 318                         died_politely = True
 319
 320                         # The test suite has now run, and what happens next varies:
 321                         # 1. The suite either completed fully without failures, or timed out: Just quit.
 322                         # 2. The suite crashed (halted without completing):
 323                         #   Remember any failures for later and rerun, using a blacklist of testcases we have completed.
 324                         # 3. The suite completed, but there were failures reported:
 325                         #   Rerun, using a whitelist of only reported-failed testcases.
 326                         # 4. The suite crashed partway through a run with a whitelist:
 327                         #   Rerun, using a whitelist consisting of the previous whitelist minus successful testcases.
 328
 329                         crashed_at = attemptFirstLine(env[CURRENT_TEST_KEY])
 330                         failed_tests = attemptLines(env[FAILED_TEST_KEY])
 331                         ran_tests = attemptLines(env[RAN_TEST_KEY])
 332                         bailout = False
 333
 334                         if crashed_at or failed_tests or ran_tests: # Test suite follows the babysitter protocol
 335                                 log_value(SUPPORT_JSON, True)
 336
 337                         if not crashed_at and not ever_completed:  # The resume_after whitelist is only
 338                                 resume_after = []                      # used before the first noncrashing run
 339                                 ever_completed = True
 340
 341                         if timed_out:       # Currently no retries after timeout
 342                                 bailout = True
 343                                 code = 124      # See GNU timeout manpage
 344
 345                         if (code or crashed_at) and babysitting: # Process failures
 346                                 # Handle crash failures
 347                                 if crashed_at and not timed_out:
 348                                         log_test(crashed_at, TEST_CRASH_FAILURES, add=1)
 349                                         if not crash_resuming:
 350                                                 bailout = True
 351
 352                                         if failure_may_retry(crashed_at):
 353                                                 if ever_completed:           # Rerun with whitelist next time
 354                                                         for test in retry_these: # Prepopulate with last whitelist minus run testcases
 355                                                                 if test == crashed_at or test not in ran_tests: # (plus crashed testcase)
 356                                                                         retry_next.append(test)
 357                                                 else:                            # Rerun with blacklist next time
 358                                                         for test in ran_tests:       # Add testcases we just ran to blacklist
 359                                                                 if test != crashed_at:   # (except for the crashed testcase)
 360                                                                         resume_after.append(test)
 361                                         else:
 362                                                 bailout = True
 363
 364                                 # Handle reported failures
 365                                 for test in failed_tests:
 366                                         log_test(test, TEST_FAILURES, add=1)
 367                                         if failure_may_retry(test):
 368                                                 retry_next.append(test)
 369                                         else:
 370                                                 bailout = True
 371
 372                                 # Report human-readable failures for stdout log.
 373                                 message = "%s:" % (scriptname)
 374                                 if timed_out:
 375                                         message += " Saw timeout in test case %s (never allowed)." % (crashed_at)
 376                                 elif crashed_at:
 377                                         message += " Saw crash in test case %s." % (failure_annotate(crashed_at))
 378                                 if failed_tests:
 379                                         message += " Saw test failure in test case%s %s." % (pluralize(failed_tests), "; ".join(map(failure_annotate, failed_tests)))
 380                                 if bailout:
 381                                         message += " Will halt testing."
 382                                 print(message)
 383
 384                         if bailout or not (babysitting and (resume_after or retry_next)): # If not retrying
 385                                 return code
 386
 387                         # If we got here, a retry is occurring
 388                         retry_these = retry_next
 389
 390                         # Report human-readable retry notice for stdout log.
 391                         message = "%s: Will rerun test suite" % (scriptname)
 392                         if log[COUNT_JSON] > 1:
 393                                 message += " (iteration #%d)" % (log[COUNT_JSON])
 394                         if resume_after:
 395                                 message += ", resuming at crashed testcase %s." % (crashed_at)
 396                         else:
 397                                 message += ", running previously failed testcase%s: %s." % (pluralize(retry_these), "; ".join(retry_these))
 398                         print(message)
 399         finally:
 400                 # Emergency: Ensure command does not outlive this script
 401                 if proc is not None and not died_politely:
 402                         send_signal(proc, signal.SIGKILL)
 403
 404                 # Write out logs
 405                 log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)
 406                 if logging:
 407                         with open(logfile, "a") as f:
 408                                 f.write(json.dumps(log) + os.linesep)
 409
 410 sys.exit( run() )