More peer review changes on scripts/babysitter
[mono.git] / scripts / babysitter
1 #!/usr/bin/env python
2
3 # Mimics GNU timeout, but does some fancy tracking based on custom features in mono nunit24.
4
5 import argparse
6 import subprocess
7 import re
8 import signal
9 import time
10 import sys
11 import os.path
12 import copy
13 import tempfile
14 import calendar
15 import json
16
17 ### Constants
18
19 # Here is how the communication with nunit works. It has to work with two constraints:
20 # - We don't invoke nunit directly. We invoke some Makefile which invokes some other Makefile
21 #   and at some point down the line someone calls nunit.
22 # - nunit has to be able to report back to us even if (especially if) it terminates improperly.
23 # To deal with all this, communication babysitter->nunit is done by environment variables,
24 # and communication nunit->babysitter is done by leaving behind files in known locations.
25
26 # Filenames
27
28 CURRENT_TEST_FILE = "babysitter_report_current_test_file.txt"
29 RAN_TEST_FILE     = "babysitter_report_ran_test_file.txt"
30 FAILED_TEST_FILE  = "babysitter_report_failed_test_file.txt"
31 LOGGING_FILE      = "babysitter_report.json_lines"
32
33 # Environment keys
34
35 # Keys used for Babysitter<->Nunit IPC
36 CURRENT_TEST_KEY  = 'MONO_BABYSITTER_NUNIT_CURRENT_TEST_FILE' # Tell nunit where to leave files
37 RAN_TEST_KEY      = 'MONO_BABYSITTER_NUNIT_RAN_TEST_FILE'
38 FAILED_TEST_KEY   = 'MONO_BABYSITTER_NUNIT_FAILED_TEST_FILE'
39 RUN_KEY           = 'MONO_BABYSITTER_NUNIT_RUN_TEST' # Semicolon-separated list of test names
40 RUN_MODE_KEY      = 'MONO_BABYSITTER_NUNIT_RUN_MODE' # Equal to either RUN or AFTER
41
42 # Keys used for script configuration (see --help text)
43 LOGGING_DIR_KEY   = 'WORKSPACE'                      # Path
44 RETRY_KEY         = 'MONO_BABYSITTER_RETRY'          # Equal to an integer
45 VERBOSE_KEY       = 'MONO_BABYSITTER_VERBOSE'        # "Undocumented"-- used for debugging babysitter
46
47 # JSON keys
48
49 DATE_JSON       = 'date'          # POSIX timestamp of test suite run
50 INVOKE_JSON     = 'invocation'
51 COUNT_JSON      = 'iteration'        # How many times was command executed?
52 LIMIT_JSON      = 'failure_max'
53 SUPPORT_JSON    = 'retry_support' # Was the test suite running with a babysitter-aware nunit?
54 FINAL_CODE_JSON = 'final_code'
55 TESTS_JSON      = 'tests'         # Holds dictionary of (test case name)->(dict with TEST_ keys below)
56 TEST_FAILURES         = 'normal_failures'
57 TEST_CRASH_FAILURES   = 'crash_failures'
58 TEST_TIMEOUT_FAILURES = 'timeout_failures'
59
60 ### Interpret arguments
61
62 scriptname = sys.argv[0]
63
64 # This is very silly: This program needs to switch from argparse to manual parsing
65 # after the second positional argument, but argparse doesn't let you do this.
66 # I work around this with a custom subclass which can selectively swallow errors:
67 class Hesitate(Exception):
68         pass
69 class HesitantParser(argparse.ArgumentParser):
70         def __init__(s, *args, **kwargs):
71                 s.hesitating = True  # Errors will be swallowed until this is set false
72                 argparse.ArgumentParser.__init__(s, *args, **kwargs)
73         def error(s, *args, **kwargs):
74                 if s.hesitating:
75                         raise Hesitate() # Bail out before errors are printed.
76                 argparse.ArgumentParser.error(s, *args, **kwargs)
77
78 # Define args
79 argparser = HesitantParser(description="""\
80 Run a test suite with a timeout.\n
81 Durations are floating point numbers followed by an optional unit:\n
82 's' for seconds (the default)
83 'm' for minutes
84 'h' for hours
85 'd' for days\n
86 supported environment variables:
87   %s: Directory path to save logs into
88   %s: If set to a number, failed test cases will be rerun this many times (NUnit test suites only)""" %
89                 (LOGGING_DIR_KEY, RETRY_KEY),
90         formatter_class=argparse.RawTextHelpFormatter)
91 argparser.add_argument('-s', '--signal', dest='signal', metavar='signal', default='TERM',
92         help="Send this signal to the command on timeout, instead of TERM.")
93 argparser.add_argument('-k', '--kill-after-duration', dest='kill_after', metavar='duration',
94         help="If process continues running after signal, send KILL after this duration.")
95 argparser.add_argument('duration',
96         help="Time to run before sending signal.")
97 argparser.add_argument('command', nargs="+", help="Command+args to run.")
98
99 # Repeatedly parse the given args until we find the shortest prefix for which parsing succeeds.
100 argc = len(sys.argv)
101 extra_args = []
102 for limit in range(1,argc+1):
103         try:
104                 if limit == argc: # On the final pass, parse for real
105                         argparser.hesitating = False
106                 args = argparser.parse_args(sys.argv[1:limit])
107                 # If we're still here, parse_args succeeded.
108                 # The final parsed arg is the command; remaining argv items are its args.
109                 extra_args = sys.argv[limit:]
110                 break
111         except Hesitate: # Before the final pass, if parse_args fails, skip
112                 pass
113
114 argparser.hesitating = False # Just act like a normal argparser from here
115
116 durationre = re.compile(r'(\d+)([smhd]?)')
117 def parse_duration(duration): # Accept units
118         match = durationre.match(duration)
119         if not match:
120                 argparser.error("Could not understand duration %s" % duration)
121         time, units = match.group(1), match.group(2)
122         time = int(time)
123         if units == 'm':
124                 time *= 60
125         elif units == 'h':
126                 time *= 60*60
127         elif units == 'd':
128                 time *= 60*60*24
129         return time
130
131 def parse_signal(sig):        # Accept names
132         if sig.isdigit():
133                 return int(sig)
134         for k,v in signal.__dict__.iteritems():
135                 if k == ("SIG%s" % sig):
136                         return v
137         argparser.error("Could not understand signal name %s" % sig)
138
139 # Final interpretation of arguments
140 duration = parse_duration(args.duration)
141 kill_after = parse_duration(args.kill_after) if args.kill_after is not None else None
142 timeout_signal = parse_signal(args.signal)
143 command = args.command + extra_args
144
145 # Process environment
146 global_env = copy.deepcopy( os.environ )
147
148 verbose = VERBOSE_KEY in global_env
149 logging = LOGGING_DIR_KEY in global_env
150 logging_dir = global_env[LOGGING_DIR_KEY] if logging else None
151 logfile = os.path.join(logging_dir, LOGGING_FILE) if logging else None
152 crash_resuming = True # TODO: Consider exposing this option, or adding a retry_on_crash option.
153 failmax = int(global_env[RETRY_KEY]) if RETRY_KEY in global_env else 0
154 babysitting = logging or failmax
155 if babysitting:
156         babysitter_dir = tempfile.mkdtemp()
157         global_env[CURRENT_TEST_KEY] = os.path.join(babysitter_dir, CURRENT_TEST_FILE)
158         global_env[RAN_TEST_KEY]     = os.path.join(babysitter_dir, RAN_TEST_FILE)
159         global_env[FAILED_TEST_KEY]  = os.path.join(babysitter_dir, FAILED_TEST_FILE)
160
161 ### Utility functions
162
163 def wait(proc, duration):
164         # TODO: If we detect Python 3.3, Popen objects have a wait(timeout) method we can use
165         start = time.time()
166         while True:
167                 code = proc.poll()
168                 if code is not None:
169                         return code
170                 if time.time()-start > duration:
171                         return None
172                 time.sleep(0.05)
173
174 def attemptDelete(path):
175         try:
176                 os.remove(path)
177         except OSError:
178                 pass
179
180 def attemptLines(path):
181         try:
182                 with open(path) as f:
183                         return map(lambda s: s.strip('\r\n'), f.readlines())
184         except (OSError, IOError):
185                 return []
186
187 def attemptFirstLine(path):
188         lines = attemptLines(path)
189         if lines:
190                 return lines[0]
191         return None
192
193 def posixtime(): # Amazingly, time.time() does not guarantee an epoch in the docs. However this does:
194         return calendar.timegm(time.gmtime())
195
196 failcount = {}
197 def failure_may_retry(test):
198         if test not in failcount:
199                 failcount[test] = 0
200         failcount[test] += 1
201         return failcount[test] < failmax
202
203 def verbose_print(arg):
204         if (verbose):
205                 print(arg)
206
207 def failure_annotate(test):
208         return "%s (failure #%d of %d allowed)" % (test, failcount[test], failmax)
209
210 def pluralize(lst):
211         return "s" if len(lst) > 1 else ""
212
213 ### Run command
214
215 def run(): # Returns exit code
216         resume_after = []
217         retry_these = []
218         ever_completed = False
219         died_politely = False
220         proc = None
221         code = None
222
223         # Set up logging
224         if logging:
225                 log = {DATE_JSON: posixtime(), COUNT_JSON:0, LIMIT_JSON:failmax, SUPPORT_JSON:False,
226                         INVOKE_JSON: " ".join(command)}
227         else:
228                 log = None
229
230         def log_value(key, set=None, add=None, target=log): # Call to add toplevel value to log
231                 if not logging:
232                         return
233                 if add is not None:
234                         if key not in target:
235                                 target[key] = 0
236                         target[key] += add
237                 else:
238                         target[key] = set
239
240         def log_test(testname, key, set=None, add=None):   # Call to add test-case-level value to log
241                 if not logging:
242                         return
243                 if TESTS_JSON not in log:
244                         log[TESTS_JSON] = {}
245                 if testname not in log[TESTS_JSON]:
246                         log[TESTS_JSON][testname] = {}
247                 log_value(key, set=set, add=add, target=log[TESTS_JSON][testname])
248
249         # Ready to run tests
250         try:
251                 while True:
252                         env = copy.copy(global_env)
253                         if ever_completed:
254                                 retry_next = []
255                         else: # Persist reported failures list until first non-crash run
256                                 retry_next = retry_these
257
258                         log_value(COUNT_JSON, add=1)
259
260                         # Prepare environment/filesystem
261                         if babysitting:
262                                 for key in [CURRENT_TEST_KEY, RAN_TEST_KEY, FAILED_TEST_KEY]:
263                                         attemptDelete(env[key])
264                                 if resume_after:
265                                         env[RUN_KEY] = ";".join(resume_after)
266                                         env[RUN_MODE_KEY] = "EXCLUDE"
267                                 elif retry_these:
268                                         env[RUN_KEY] = ";".join(retry_these)
269                                         env[RUN_MODE_KEY] = "RUN"
270
271                         # Run test suite
272                         try:
273                                 proc = subprocess.Popen(command, env=env)
274                         except OSError:
275                                 died_politely = True
276                                 sys.stderr.write("%s: Could not execute command `%s`\n" % (scriptname, command[0]))
277                                 sys.exit(127)
278
279                         code = wait(proc, duration)
280                         timed_out = code is None
281                         if timed_out:                  # Initial timeout
282                                 proc.send_signal(timeout_signal)
283                                 if kill_after is not None: # Kill-after timeout
284                                         code = wait(proc, kill_after)
285                                         if code is None:
286                                                 proc.send_signal(9)
287                                 code = proc.wait()         # Waits forever
288                                 sys.stderr.write("%s: Command `%s` timed out\n" % (scriptname, command[0]))
289                         died_politely = True
290
291                         # The test suite has now run, and what happens next varies:
292                         # 1. The suite either completed fully without failures, or timed out: Just quit.
293                         # 2. The suite crashed (halted without completing):
294                         #   Remember any failures for later and rerun, using a blacklist of testcases we have completed.
295                         # 3. The suite completed, but there were failures reported:
296                         #   Rerun, using a whitelist of only reported-failed testcases.
297                         # 4. The suite crashed partway through a run with a whitelist:
298                         #   Rerun, using a whitelist consisting of the previous whitelist minus successful testcases.
299
300                         crashed_at = attemptFirstLine(env[CURRENT_TEST_KEY])
301                         failed_tests = attemptLines(env[FAILED_TEST_KEY])
302                         ran_tests = attemptLines(env[RAN_TEST_KEY])
303                         bailout = False
304
305                         if crashed_at or failed_tests or ran_tests: # Test suite follows the babysitter protocol
306                                 log_value(SUPPORT_JSON, True)
307
308                         if not crashed_at and not ever_completed:  # The resume_after whitelist is only
309                                 resume_after = []                      # used before the first noncrashing run
310                                 ever_completed = True
311
312                         if timed_out:       # Currently no retries after timeout
313                                 bailout = True
314                                 code = 124      # See GNU timeout manpage
315
316                         if (code or crashed_at) and babysitting: # Process failures
317                                 # Handle crash failures
318                                 if crashed_at and not timed_out:
319                                         log_test(crashed_at, TEST_CRASH_FAILURES, add=1)
320                                         if not crash_resuming:
321                                                 bailout = True
322
323                                         if failure_may_retry(crashed_at):
324                                                 if ever_completed:           # Rerun with whitelist next time
325                                                         for test in retry_these: # Prepopulate with last whitelist minus run testcases
326                                                                 if test == crashed_at or test not in ran_tests: # (plus crashed testcase)
327                                                                         retry_next.append(test)
328                                                 else:                            # Rerun with blacklist next time
329                                                         for test in ran_tests:       # Add testcases we just ran to blacklist
330                                                                 if test != crashed_at:   # (except for the crashed testcase)
331                                                                         resume_after.append(test)
332                                         else:
333                                                 bailout = True
334
335                                 # Handle reported failures
336                                 for test in failed_tests:
337                                         log_test(test, TEST_FAILURES, add=1)
338                                         if failure_may_retry(test):
339                                                 retry_next.append(test)
340                                         else:
341                                                 bailout = True
342
343                                 # Report human-readable failures for stdout log.
344                                 message = "%s:" % (scriptname)
345                                 if timed_out:
346                                         message += " Saw timeout in test case %s (never allowed)." % (crashed_at)
347                                 elif crashed_at:
348                                         message += " Saw crash in test case %s." % (failure_annotate(crashed_at))
349                                 if failed_tests:
350                                         message += " Saw test failure in test case%s %s." % (pluralize(failed_tests), "; ".join(map(failure_annotate, failed_tests)))
351                                 if bailout:
352                                         message += " Will halt testing."
353                                 print(message)
354
355                         if bailout or not (babysitting and (resume_after or retry_next)): # If not retrying
356                                 return code
357
358                         # If we got here, a retry is occurring
359                         retry_these = retry_next
360
361                         # Report human-readable retry notice for stdout log.
362                         message = "%s: Will rerun test suite" % (scriptname)
363                         if log[COUNT_JSON] > 1:
364                                 message += " (iteration #%d)" % (log[COUNT_JSON])
365                         if resume_after:
366                                 message += ", resuming at crashed testcase %s." % (crashed_at)
367                         else:
368                                 message += ", running previously failed testcase%s: %s." % (pluralize(retry_these), "; ".join(retry_these))
369                         print(message)
370         finally:
371                 # Emergency: Ensure command does not outlive this script
372                 if proc is not None and not died_politely:
373                         proc.send_signal(9)
374
375                 # Write out logs
376                 log_value(FINAL_CODE_JSON, "EXCEPTION" if code is None else code)
377                 if logging:
378                         with open(logfile, "a") as f:
379                                 f.write(json.dumps(log) + os.linesep)
380
381 sys.exit( run() )