[runtime] Fix potential overflow when using mono_msec_ticks
[mono.git] / scripts / babysitter
index f48e07957e89cbaf3a9a73e0a4b0f39d83cfcff9..f8b5a6c345ca1545b6e47e06d44172ce7c6a69b9 100755 (executable)
@@ -1,4 +1,4 @@
-#!/usr/bin/python
+#!/usr/bin/env python
 
 # Mimics GNU timeout, but does some fancy tracking based on custom features in mono nunit24.
 
@@ -32,25 +32,27 @@ LOGGING_FILE      = "babysitter_report.json_lines"
 
 # Environment keys
 
-CURRENT_TEST_KEY  = 'NUNIT_BABYSITTER_CURRENT_TEST_FILE' # Tell nunit where to leave files
-RAN_TEST_KEY      = 'NUNIT_BABYSITTER_RAN_TEST_FILE'
-FAILED_TEST_KEY   = 'NUNIT_BABYSITTER_FAILED_TEST_FILE'
-RUN_KEY           = 'NUNIT_BABYSITTER_RUN_TEST' # Semicolon-separated list of test names
-RUN_MODE_KEY      = 'NUNIT_BABYSITTER_RUN_MODE' # Equal to either RUN or AFTER
-RETRY_KEY         = 'BABYSITTER_RETRY'          # Equal to an integer
-CRASH_RESUME_KEY  = 'BABYSITTER_CRASH_RESUME'
-VERBOSE_KEY       = 'BABYSITTER_VERBOSE'
-LOGGING_DIR_KEY   = 'WORKSPACE'
+# Keys used for Babysitter<->Nunit IPC
+CURRENT_TEST_KEY  = 'MONO_BABYSITTER_NUNIT_CURRENT_TEST_FILE' # Tell nunit where to leave files
+RAN_TEST_KEY      = 'MONO_BABYSITTER_NUNIT_RAN_TEST_FILE'
+FAILED_TEST_KEY   = 'MONO_BABYSITTER_NUNIT_FAILED_TEST_FILE'
+RUN_KEY           = 'MONO_BABYSITTER_NUNIT_RUN_TEST' # Semicolon-separated list of test names
+RUN_MODE_KEY      = 'MONO_BABYSITTER_NUNIT_RUN_MODE' # Equal to either RUN or AFTER
+
+# Keys used for script configuration (see --help text)
+LOG_FILE_KEY      = 'MONO_BABYSITTER_LOG_FILE'       # Path
+RETRY_KEY         = 'MONO_BABYSITTER_RETRY'          # Equal to an integer
+VERBOSE_KEY       = 'MONO_BABYSITTER_VERBOSE'        # "Undocumented"-- used for debugging babysitter
 
 # JSON keys
 
-DATE_JSON       = 'date'          # POSIX timestamp of test run
+DATE_JSON       = 'date'          # POSIX timestamp of test suite run
 INVOKE_JSON     = 'invocation'
-COUNT_JSON      = 'passes'        # How many times was command executed?
+COUNT_JSON      = 'iteration'        # How many times was command executed?
 LIMIT_JSON      = 'failure_max'
-SUPPORT_JSON    = 'retry_support' # Was the test running with a babysitter-aware nunit?
+SUPPORT_JSON    = 'babysitter_protocol' # Was the test suite running with a babysitter-aware nunit?
 FINAL_CODE_JSON = 'final_code'
-TESTS_JSON      = 'tests'         # Holds dictionary of (test name)->(dict with TEST_ keys below)
+TESTS_JSON      = 'tests'         # Holds dictionary of (test case name)->(dict with TEST_ keys below)
 TEST_FAILURES         = 'normal_failures'
 TEST_CRASH_FAILURES   = 'crash_failures'
 TEST_TIMEOUT_FAILURES = 'timeout_failures'
@@ -75,18 +77,16 @@ class HesitantParser(argparse.ArgumentParser):
 
 # Define args
 argparser = HesitantParser(description="""\
-Run a test case with a timeout.\n
+Run a test suite with a timeout.\n
 Durations are floating point numbers followed by an optional unit:\n
 's' for seconds (the default)
 'm' for minutes
 'h' for hours
 'd' for days\n
 supported environment variables:
-  %s: Directory path to save logs into
-  %s: If set to a number, failed tests will be rerun this many times
-  %s: If set, rerun even for tests which terminated early
-  %s: If set, print extra logging during run""" %
-               (LOGGING_DIR_KEY, RETRY_KEY, CRASH_RESUME_KEY, VERBOSE_KEY),
+  %s: File to write logs to (as line-delimited JSON)
+  %s: If set to a number, failed test cases will be rerun this many times (NUnit test suites only)""" %
+               (LOG_FILE_KEY, RETRY_KEY),
        formatter_class=argparse.RawTextHelpFormatter)
 argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
        help="Send this signal to the command on timeout, instead of TERM.")
@@ -146,22 +146,24 @@ command = args.command + extra_args
 global_env = copy.deepcopy( os.environ )
 
 verbose = VERBOSE_KEY in global_env
-logging = LOGGING_DIR_KEY in global_env
-logging_dir = global_env[LOGGING_DIR_KEY] if logging else None
-logfile = os.path.join(logging_dir, LOGGING_FILE) if logging else None
-crash_resuming = CRASH_RESUME_KEY in global_env
+logging = LOG_FILE_KEY in global_env
+logfile = global_env[LOG_FILE_KEY] if logging else None
+crash_resuming = True # TODO: Consider exposing this option, or adding a retry_on_crash option.
 failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0
-babysitting = logging or failmax
+babysitting = True # If false, babysitter becomes a timeout clone with no env manipulation or anything.
 if babysitting:
        babysitter_dir = tempfile.mkdtemp()
        global_env[CURRENT_TEST_KEY] = os.path.join(babysitter_dir, CURRENT_TEST_FILE)
        global_env[RAN_TEST_KEY]     = os.path.join(babysitter_dir, RAN_TEST_FILE)
        global_env[FAILED_TEST_KEY]  = os.path.join(babysitter_dir, FAILED_TEST_FILE)
 
-### Utility functions
+have_unix_process_groups = 'killpg' in os.__dict__
+have_windows_process_groups = 'CREATE_NEW_PROCESS_GROUP' in subprocess.__dict__
+
+### Timeout implementation
 
 def wait(proc, duration):
-       # FIXME: If we can guarantee Python 3.3, Popen objects have a wait(timeout) method
+       # TODO: If we detect Python 3.3, Popen objects have a wait(timeout) method we can use
        start = time.time()
        while True:
                code = proc.poll()
@@ -171,6 +173,33 @@ def wait(proc, duration):
                        return None
                time.sleep(0.05)
 
+# Popen and send_signal can't be called in their basic forms because we want to
+# send signals to all children, not just to the immediately spawned process.
+# Unfortunately the way to do this varies by operating system.
+def popen(*args, **kwargs):
+       if have_unix_process_groups: # Call function on spawn to become process group leader
+               kwargs['preexec_fn'] = os.setsid
+       elif have_windows_process_groups: # Set magic flag for Windows process groups
+               kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
+       return subprocess.Popen(*args, **kwargs)
+
+def send_signal(proc, sig):
+       if have_unix_process_groups: # UNIX
+               # For compatibility with GNU timeout, pre-send the signal to just the monitored process
+               os.kill(proc.pid, sig)
+               # Send signal to entire group
+               os.killpg(proc.pid, sig)
+               # For compatibility with GNU Timeout, send a SIGCONT after the signal
+               # (so delivery has a chance to occur even for stopped processes)
+               if sig != signal.SIGKILL and sig != signal.SIGCONT:
+                       os.kill(proc.pid, signal.SIGCONT)
+       elif have_windows_process_groups: # Windows with Python 2.7 or better
+               os.kill(proc.pid, sig) # Becuase CREATE_NEW_PROCESS_GROUP, will go to entire group
+       else: # Windows with Python 2.6-- CREATE_NEW_PROCESS_GROUP not supported
+               proc.send_signal(sig) # No way to contact group, just kill process
+
+### Utility functions
+
 def attemptDelete(path):
        try:
                os.remove(path)
@@ -204,6 +233,12 @@ def verbose_print(arg):
        if (verbose):
                print(arg)
 
+def failure_annotate(test):
+       return "%s (failure #%d of %d allowed)" % (test, failcount[test], failmax)
+
+def pluralize(lst):
+       return "s" if len(lst) > 1 else ""
+
 ### Run command
 
 def run(): # Returns exit code
@@ -215,15 +250,10 @@ def run(): # Returns exit code
        code = None
 
        # Set up logging
-       if logging:
-               log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
-                       INVOKE_JSON: " ".join(command)}
-       else:
-               log = None
+       log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
+               INVOKE_JSON: " ".join(command)}
 
        def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
-               if not logging:
-                       return
                if add is not None:
                        if key not in target:
                                target[key] = 0
@@ -231,9 +261,7 @@ def run(): # Returns exit code
                else:
                        target[key] = set
 
-       def log_test(testname, key, set=None, add=None):   # Call to add test-level value to log
-               if not logging:
-                       return
+       def log_test(testname, key, set=None, add=None):   # Call to add test-case-level value to log
                if TESTS_JSON not in log:
                        log[TESTS_JSON] = {}
                if testname not in log[TESTS_JSON]:
@@ -262,9 +290,9 @@ def run(): # Returns exit code
                                        env[RUN_KEY] = ";".join(retry_these)
                                        env[RUN_MODE_KEY] = "RUN"
 
-                       # Run test
+                       # Run test suite
                        try:
-                               proc = subprocess.Popen(command, env=env)
+                               proc = popen(command, env=env)
                        except OSError:
                                died_politely = True
                                sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
@@ -273,23 +301,23 @@ def run(): # Returns exit code
                        code = wait(proc, duration)
                        timed_out = code is None
                        if timed_out:                  # Initial timeout
-                               proc.send_signal(timeout_signal)
+                               send_signal(proc, timeout_signal)
                                if kill_after is not None: # Kill-after timeout
                                        code = wait(proc, kill_after)
                                        if code is None:
-                                               proc.send_signal(9)
+                                               send_signal(proc, signal.SIGKILL)
                                code = proc.wait()         # Waits forever
                                sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
                        died_politely = True
 
-                       # The test has now run, and what happens next varies:
-                       # 1. The test either completed fully without failures, or timed out: Just quit.
-                       # 2. The test crashed (halted without completing):
-                       #   Remember any failures for later and rerun, using a blacklist of tests we have completed.
-                       # 3. The test completed, but there were failures reported:
-                       #   Rerun, using a whitelist of only reported-failed tests.
-                       # 4. The test crashed partway through a run with a whitelist:
-                       #   Rerun, using a whitelist consisting of the previous whitelist minus successful tests.
+                       # The test suite has now run, and what happens next varies:
+                       # 1. The suite either completed fully without failures, or timed out: Just quit.
+                       # 2. The suite crashed (halted without completing):
+                       #   Remember any failures for later and rerun, using a blacklist of testcases we have completed.
+                       # 3. The suite completed, but there were failures reported:
+                       #   Rerun, using a whitelist of only reported-failed testcases.
+                       # 4. The suite crashed partway through a run with a whitelist:
+                       #   Rerun, using a whitelist consisting of the previous whitelist minus successful testcases.
 
                        crashed_at = attemptFirstLine(env[CURRENT_TEST_KEY])
                        failed_tests = attemptLines(env[FAILED_TEST_KEY])
@@ -300,15 +328,14 @@ def run(): # Returns exit code
                                log_value(SUPPORT_JSON, True)
 
                        if not crashed_at and not ever_completed:  # The resume_after whitelist is only
-                               verbose_print ("--- First completion") # used before the first noncrashing run
-                               resume_after = []
+                               resume_after = []                      # used before the first noncrashing run
                                ever_completed = True
 
                        if timed_out:       # Currently no retries after timeout
                                bailout = True
                                code = 124      # See GNU timeout manpage
 
-                       if (code or crashed_at) and babysitting: # Process failures
+                       if code or crashed_at: # Process failures
                                # Handle crash failures
                                if crashed_at and not timed_out:
                                        log_test(crashed_at, TEST_CRASH_FAILURES, add=1)
@@ -316,49 +343,67 @@ def run(): # Returns exit code
                                                bailout = True
 
                                        if failure_may_retry(crashed_at):
-                                               verbose_print( "--- CRASH FAIL    on %s (will retry)" % (crashed_at) )
                                                if ever_completed:           # Rerun with whitelist next time
-                                                       for test in retry_these: # Prepopulate with last whitelist minus run tests
-                                                               if test == crashed_at or test not in ran_tests: # (plus crashed test)
+                                                       for test in retry_these: # Prepopulate with last whitelist minus run testcases
+                                                               if test == crashed_at or test not in ran_tests: # (plus crashed testcase)
                                                                        retry_next.append(test)
                                                else:                            # Rerun with blacklist next time
-                                                       for test in ran_tests:       # Add tests we just ran to blacklist
-                                                               if test != crashed_at:   # (except for the crashed test)
+                                                       for test in ran_tests:       # Add testcases we just ran to blacklist
+                                                               if test != crashed_at:   # (except for the crashed testcase)
                                                                        resume_after.append(test)
                                        else:
-                                               verbose_print( "--- CRASH FAIL    on %s (will NOT retry)" % (crashed_at) )
                                                bailout = True
 
                                # Handle reported failures
                                for test in failed_tests:
                                        log_test(test, TEST_FAILURES, add=1)
                                        if failure_may_retry(test):
-                                               verbose_print( "--- REPORTED FAIL on %s (will retry)" % (test) )
                                                retry_next.append(test)
                                        else:
-                                               verbose_print( "--- REPORTED FAIL on %s (will NOT retry)" % (test) )
                                                bailout = True
 
-                       if bailout or not (babysitting and (resume_after or retry_next)): # If not retrying
+                               # Report human-readable failures for stdout log.
+                               message = "%s:" % (scriptname)
+                               if timed_out:
+                                       message += " Saw timeout in test case %s (never allowed)." % (crashed_at)
+                               elif crashed_at:
+                                       message += " Saw crash in test case %s." % (failure_annotate(crashed_at))
+                               if failed_tests:
+                                       message += " Saw test failure in test case%s %s." % (pluralize(failed_tests), "; ".join(map(failure_annotate, failed_tests)))
+                               if not (timed_out or crashed_at or failed_tests):
+                                       message += " Test suite terminated with code %d, " % (code)
+                                       if log[SUPPORT_JSON]:
+                                               message += "but failure did not occur during a test case. Halting."
+                                       else:
+                                               message += "and suite cannot report test case data. Halting."
+                               elif bailout:
+                                       message += " Will halt testing."
+                               print(message)
+
+                       if bailout or not (resume_after or retry_next): # If not retrying
                                return code
 
                        # If we got here, a retry is occurring
                        retry_these = retry_next
 
+                       # Report human-readable retry notice for stdout log.
+                       message = "%s: Will rerun test suite" % (scriptname)
+                       if log[COUNT_JSON] > 1:
+                               message += " (iteration #%d)" % (log[COUNT_JSON])
                        if resume_after:
-                               print "Babysitter script will rerun, resuming at crashed test %s" % (crashed_at)
+                               message += ", resuming at crashed testcase %s." % (crashed_at)
                        else:
-                               print "Babysitter script will rerun, running %d failed tests" % (len(retry_these))
-                       verbose_print( "--- Tests pending to rerun: %s" % (retry_these) )
+                               message += ", running previously failed testcase%s: %s." % (pluralize(retry_these), "; ".join(retry_these))
+                       print(message)
        finally:
                # Emergency: Ensure command does not outlive this script
                if proc is not None and not died_politely:
-                       proc.send_signal(9)
+                       send_signal(proc, signal.SIGKILL)
 
                # Write out logs
                log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)
                if logging:
                        with open(logfile, "a") as f:
-                               f.write(json.dumps(log) + "\n")
+                               f.write(json.dumps(log) + os.linesep)
 
 sys.exit( run() )