Support process-group killing in babysitter script
author Andi McClure <andi.mcclure@xamarin.com>
Tue, 22 Dec 2015 22:38:16 +0000 (17:38 -0500)
committer Andi McClure <andi.mcclure@xamarin.com>
Tue, 22 Dec 2015 22:38:16 +0000 (17:38 -0500)
The babysitter script was failing to act as a GNU timeout replacement
because it was delivering signals to the spawned process only, whereas
GNU timeout would create a process group. This led to a problem where
timeout kills by the babysitter would leave children of the spawned
process still running. To fix this, the babysitter script now follows
the exact steps that GNU timeout does (or as close as is possible in
Python). An attempt at a Windows process-group implementation is also
included, but I have not tested it.

Also changed the 'retry_support' key to 'babysitter_protocol' in the
JSON logs, because it was pointed out the old key was misleading (it
is true even when retries are deactivated).

scripts/babysitter

index 3786fa0f98be13cbcdc187026fdcf4f42f388d39..42b038bb076e1e8e888baf3074b16ef8828badd3 100755 (executable)
@@ -50,7 +50,7 @@ DATE_JSON       = 'date'          # POSIX timestamp of test suite run
 INVOKE_JSON     = 'invocation'
 COUNT_JSON      = 'iteration'        # How many times was command executed?
 LIMIT_JSON      = 'failure_max'
-SUPPORT_JSON    = 'retry_support' # Was the test suite running with a babysitter-aware nunit?
+SUPPORT_JSON    = 'babysitter_protocol' # Was the test suite running with a babysitter-aware nunit?
 FINAL_CODE_JSON = 'final_code'
 TESTS_JSON      = 'tests'         # Holds dictionary of (test case name)->(dict with TEST_ keys below)
 TEST_FAILURES         = 'normal_failures'
@@ -157,7 +157,10 @@ if babysitting:
        global_env[RAN_TEST_KEY]     = os.path.join(babysitter_dir, RAN_TEST_FILE)
        global_env[FAILED_TEST_KEY]  = os.path.join(babysitter_dir, FAILED_TEST_FILE)
 
-### Utility functions
+have_unix_process_groups = 'killpg' in os.__dict__
+have_windows_process_groups = 'CREATE_NEW_PROCESS_GROUP' in subprocess.__dict__
+
+### Timeout implementation
 
 def wait(proc, duration):
        # TODO: If we detect Python 3.3, Popen objects have a wait(timeout) method we can use
@@ -170,6 +173,33 @@ def wait(proc, duration):
                        return None
                time.sleep(0.05)
 
+# subprocess.Popen and Popen.send_signal can't be used directly because we want to
+# send signals to all children, not just to the immediately spawned process.
+# Unfortunately the way to do this varies by operating system.
+def popen(*args, **kwargs):
+       if have_unix_process_groups: # Child runs os.setsid before exec, making it a process group leader
+               kwargs['preexec_fn'] = os.setsid
+       elif have_windows_process_groups: # Ask Windows to start the child in a new process group
+               kwargs['creationflags'] = subprocess.CREATE_NEW_PROCESS_GROUP
+       return subprocess.Popen(*args, **kwargs)
+
+def send_signal(proc, sig):
+       if have_unix_process_groups: # UNIX
+               # For compatibility with GNU timeout, pre-send the signal to just the monitored process
+               os.kill(proc.pid, sig)
+               # Send signal to entire group (proc is the group leader, so its pid is the group id)
+               os.killpg(proc.pid, sig)
+               # For compatibility with GNU timeout, send a SIGCONT after the signal
+               # (so delivery has a chance to occur even for stopped processes)
+               if sig != signal.SIGKILL and sig != signal.SIGCONT:
+                       os.kill(proc.pid, signal.SIGCONT)
+       elif have_windows_process_groups: # Windows with Python 2.7 or better
+               os.kill(proc.pid, sig) # Because of CREATE_NEW_PROCESS_GROUP, intended to reach the entire group (untested)
+       else: # Windows with Python 2.6-- CREATE_NEW_PROCESS_GROUP not supported
+               proc.send_signal(sig) # No way to contact group, just signal the process itself
+
+### Utility functions
+
 def attemptDelete(path):
        try:
                os.remove(path)
@@ -269,7 +299,7 @@ def run(): # Returns exit code
 
                        # Run test suite
                        try:
-                               proc = subprocess.Popen(command, env=env)
+                               proc = popen(command, env=env)
                        except OSError:
                                died_politely = True
                                sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
@@ -278,11 +308,11 @@ def run(): # Returns exit code
                        code = wait(proc, duration)
                        timed_out = code is None
                        if timed_out:                  # Initial timeout
-                               proc.send_signal(timeout_signal)
+                               send_signal(proc, timeout_signal)
                                if kill_after is not None: # Kill-after timeout
                                        code = wait(proc, kill_after)
                                        if code is None:
-                                               proc.send_signal(9)
+                                               send_signal(proc, signal.SIGKILL)
                                code = proc.wait()         # Waits forever
                                sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
                        died_politely = True
@@ -369,7 +399,7 @@ def run(): # Returns exit code
        finally:
                # Emergency: Ensure command does not outlive this script
                if proc is not None and not died_politely:
-                       proc.send_signal(9)
+                       send_signal(proc, signal.SIGKILL)
 
                # Write out logs
                log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)