[runtime] Fix potential overflow when using mono_msec_ticks
[mono.git] / scripts / babysitter
index 9940e8ab89c9c3c0e9ad3533fa8acbabd75d1ae0..f8b5a6c345ca1545b6e47e06d44172ce7c6a69b9 100755 (executable)
@@ -40,7 +40,7 @@ RUN_KEY           = 'MONO_BABYSITTER_NUNIT_RUN_TEST' # Semicolon-separated list
 RUN_MODE_KEY      = 'MONO_BABYSITTER_NUNIT_RUN_MODE' # Equal to either RUN or AFTER
 
 # Keys used for script configuration (see --help text)
-LOGGING_DIR_KEY   = 'WORKSPACE'                      # Path
+LOG_FILE_KEY      = 'MONO_BABYSITTER_LOG_FILE'       # Path
 RETRY_KEY         = 'MONO_BABYSITTER_RETRY'          # Equal to an integer
 VERBOSE_KEY       = 'MONO_BABYSITTER_VERBOSE'        # "Undocumented"-- used for debugging babysitter
 
@@ -50,7 +50,7 @@ DATE_JSON       = 'date'          # POSIX timestamp of test suite run
 INVOKE_JSON     = 'invocation'
 COUNT_JSON      = 'iteration'        # How many times was command executed?
 LIMIT_JSON      = 'failure_max'
-SUPPORT_JSON    = 'retry_support' # Was the test suite running with a babysitter-aware nunit?
+SUPPORT_JSON    = 'babysitter_protocol' # Was the test suite running with a babysitter-aware nunit?
 FINAL_CODE_JSON = 'final_code'
 TESTS_JSON      = 'tests'         # Holds dictionary of (test case name)->(dict with TEST_ keys below)
 TEST_FAILURES         = 'normal_failures'
@@ -84,9 +84,9 @@ Durations are floating point numbers followed by an optional unit:\n
 'h' for hours
 'd' for days\n
 supported environment variables:
-  %s: Directory path to save logs into
+  %s: File to write logs to (as line-delimited JSON)
   %s: If set to a number, failed test cases will be rerun this many times (NUnit test suites only)""" %
-               (LOGGING_DIR_KEY, RETRY_KEY),
+               (LOG_FILE_KEY, RETRY_KEY),
        formatter_class=argparse.RawTextHelpFormatter)
 argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
        help="Send this signal to the command on timeout, instead of TERM.")
@@ -146,19 +146,21 @@ command = args.command + extra_args
 global_env = copy.deepcopy( os.environ )
 
 verbose = VERBOSE_KEY in global_env
-logging = LOGGING_DIR_KEY in global_env
-logging_dir = global_env[LOGGING_DIR_KEY] if logging else None
-logfile = os.path.join(logging_dir, LOGGING_FILE) if logging else None
+logging = LOG_FILE_KEY in global_env
+logfile = global_env[LOG_FILE_KEY] if logging else None
 crash_resuming = True # TODO: Consider exposing this option, or adding a retry_on_crash option.
 failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0
-babysitting = logging or failmax
+babysitting = True # If false, babysitter becomes a timeout clone with no env manipulation or anything.
 if babysitting:
        babysitter_dir = tempfile.mkdtemp()
        global_env[CURRENT_TEST_KEY] = os.path.join(babysitter_dir, CURRENT_TEST_FILE)
        global_env[RAN_TEST_KEY]     = os.path.join(babysitter_dir, RAN_TEST_FILE)
        global_env[FAILED_TEST_KEY]  = os.path.join(babysitter_dir, FAILED_TEST_FILE)
 
-### Utility functions
+have_unix_process_groups = 'killpg' in os.__dict__
+have_windows_process_groups = 'CREATE_NEW_PROCESS_GROUP' in subprocess.__dict__
+
+### Timeout implementation
 
 def wait(proc, duration):
        # TODO: If we detect Python 3.3, Popen objects have a wait(timeout) method we can use
@@ -171,6 +173,33 @@ def wait(proc, duration):
                        return None
                time.sleep(0.05)
 
+# Popen and send_signal can't be called in their basic forms because we want to
+# send signals to all children, not just to the immediately spawned process.
+# Unfortunately the way to do this varies by operating system.
+def popen(*args, **kwargs):
+       if have_unix_process_groups: # UNIX: child runs os.setsid before exec so it leads its own session and process group
+               kwargs['preexec_fn'] = os.setsid
+       elif have_windows_process_groups: # Windows: creation flag that starts the child in a new process group
+               kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
+       return subprocess.Popen(*args, **kwargs)
+
+def send_signal(proc, sig):
+       if have_unix_process_groups: # UNIX (os.killpg is available)
+               # For compatibility with GNU timeout, first send the signal to just the monitored process
+               os.kill(proc.pid, sig)
+               # Then send the signal to the entire process group (popen made proc its leader via os.setsid)
+               os.killpg(proc.pid, sig)
+               # For compatibility with GNU timeout, send a SIGCONT after the signal
+               # (so delivery has a chance to occur even for stopped processes)
+               if sig != signal.SIGKILL and sig != signal.SIGCONT:
+                       os.kill(proc.pid, signal.SIGCONT)
+       elif have_windows_process_groups: # Windows with Python 2.7 or better
+               os.kill(proc.pid, sig) # Because of CREATE_NEW_PROCESS_GROUP, intended to reach the entire group -- NOTE(review): confirm for signals other than CTRL_C_EVENT/CTRL_BREAK_EVENT
+       else: # Windows with Python 2.6-- CREATE_NEW_PROCESS_GROUP not supported
+               proc.send_signal(sig) # No way to contact the group, so signal only the spawned process
+
+### Utility functions
+
 def attemptDelete(path):
        try:
                os.remove(path)
@@ -221,15 +250,10 @@ def run(): # Returns exit code
        code = None
 
        # Set up logging
-       if logging:
-               log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
-                       INVOKE_JSON: " ".join(command)}
-       else:
-               log = None
+       log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
+               INVOKE_JSON: " ".join(command)}
 
        def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
-               if not logging:
-                       return
                if add is not None:
                        if key not in target:
                                target[key] = 0
@@ -238,8 +262,6 @@ def run(): # Returns exit code
                        target[key] = set
 
        def log_test(testname, key, set=None, add=None):   # Call to add test-case-level value to log
-               if not logging:
-                       return
                if TESTS_JSON not in log:
                        log[TESTS_JSON] = {}
                if testname not in log[TESTS_JSON]:
@@ -270,7 +292,7 @@ def run(): # Returns exit code
 
                        # Run test suite
                        try:
-                               proc = subprocess.Popen(command, env=env)
+                               proc = popen(command, env=env)
                        except OSError:
                                died_politely = True
                                sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
@@ -279,11 +301,11 @@ def run(): # Returns exit code
                        code = wait(proc, duration)
                        timed_out = code is None
                        if timed_out:                  # Initial timeout
-                               proc.send_signal(timeout_signal)
+                               send_signal(proc, timeout_signal)
                                if kill_after is not None: # Kill-after timeout
                                        code = wait(proc, kill_after)
                                        if code is None:
-                                               proc.send_signal(9)
+                                               send_signal(proc, signal.SIGKILL)
                                code = proc.wait()         # Waits forever
                                sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
                        died_politely = True
@@ -313,7 +335,7 @@ def run(): # Returns exit code
                                bailout = True
                                code = 124      # See GNU timeout manpage
 
-                       if (code or crashed_at) and babysitting: # Process failures
+                       if code or crashed_at: # Process failures
                                # Handle crash failures
                                if crashed_at and not timed_out:
                                        log_test(crashed_at, TEST_CRASH_FAILURES, add=1)
@@ -348,11 +370,17 @@ def run(): # Returns exit code
                                        message += " Saw crash in test case %s." % (failure_annotate(crashed_at))
                                if failed_tests:
                                        message += " Saw test failure in test case%s %s." % (pluralize(failed_tests), "; ".join(map(failure_annotate, failed_tests)))
-                               if bailout:
+                               if not (timed_out or crashed_at or failed_tests):
+                                       message += " Test suite terminated with code %d, " % (code)
+                                       if log[SUPPORT_JSON]:
+                                               message += "but failure did not occur during a test case. Halting."
+                                       else:
+                                               message += "and suite cannot report test case data. Halting."
+                               elif bailout:
                                        message += " Will halt testing."
                                print(message)
 
-                       if bailout or not (babysitting and (resume_after or retry_next)): # If not retrying
+                       if bailout or not (resume_after or retry_next): # If not retrying
                                return code
 
                        # If we got here, a retry is occurring
@@ -370,7 +398,7 @@ def run(): # Returns exit code
        finally:
                # Emergency: Ensure command does not outlive this script
                if proc is not None and not died_politely:
-                       proc.send_signal(9)
+                       send_signal(proc, signal.SIGKILL)
 
                # Write out logs
                log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)