[sgen] Improve STW debugging output
[mono.git] / mono / metadata / sgen-stw.c
1 /*
2  * sgen-stw.c: Stop the world functionality
3  *
4  * Author:
5  *      Paolo Molaro (lupus@ximian.com)
6  *  Rodrigo Kumpera (kumpera@gmail.com)
7  *
8  * Copyright 2005-2011 Novell, Inc (http://www.novell.com)
9  * Copyright 2011 Xamarin Inc (http://www.xamarin.com)
10  * Copyright 2011 Xamarin, Inc.
11  * Copyright (C) 2012 Xamarin Inc
12  *
13  * Licensed under the MIT license. See LICENSE file in the project root for full license information.
14  */
15
16 #include "config.h"
17 #ifdef HAVE_SGEN_GC
18
19 #include "sgen/sgen-gc.h"
20 #include "sgen/sgen-protocol.h"
21 #include "sgen/sgen-memory-governor.h"
22 #include "sgen/sgen-thread-pool.h"
23 #include "metadata/profiler-private.h"
24 #include "sgen/sgen-client.h"
25 #include "metadata/sgen-bridge-internals.h"
26 #include "metadata/gc-internals.h"
27 #include "utils/mono-threads.h"
28
29 #define TV_DECLARE SGEN_TV_DECLARE
30 #define TV_GETTIME SGEN_TV_GETTIME
31 #define TV_ELAPSED SGEN_TV_ELAPSED
32
33 static void sgen_unified_suspend_restart_world (void);
34 static void sgen_unified_suspend_stop_world (void);
35
36 unsigned int sgen_global_stop_count = 0;
37
38 inline static void*
39 align_pointer (void *ptr)
40 {
41         mword p = (mword)ptr;
42         p += sizeof (gpointer) - 1;
43         p &= ~ (sizeof (gpointer) - 1);
44         return (void*)p;
45 }
46
47 static MonoContext cur_thread_ctx;
48
49 static void
50 update_current_thread_stack (void *start)
51 {
52         int stack_guard = 0;
53         SgenThreadInfo *info = mono_thread_info_current ();
54         
55         info->client_info.stack_start = align_pointer (&stack_guard);
56         g_assert (info->client_info.stack_start);
57         g_assert (info->client_info.stack_start >= info->client_info.stack_start_limit && info->client_info.stack_start < info->client_info.stack_end);
58         MONO_CONTEXT_GET_CURRENT (cur_thread_ctx);
59         memcpy (&info->client_info.ctx, &cur_thread_ctx, sizeof (MonoContext));
60         if (mono_gc_get_gc_callbacks ()->thread_suspend_func)
61                 mono_gc_get_gc_callbacks ()->thread_suspend_func (info->client_info.runtime_data, NULL, &info->client_info.ctx);
62 }
63
64 static gboolean
65 is_ip_in_managed_allocator (MonoDomain *domain, gpointer ip)
66 {
67         MonoJitInfo *ji;
68
69         if (!mono_thread_internal_current ())
70                 /* Happens during thread attach */
71                 return FALSE;
72
73         if (!ip || !domain)
74                 return FALSE;
75         if (!sgen_has_critical_method ())
76                 return FALSE;
77
78         /*
79          * mono_jit_info_table_find is not async safe since it calls into the AOT runtime to load information for
80          * missing methods (#13951). To work around this, we disable the AOT fallback. For this to work, the JIT needs
81          * to register the jit info for all GC critical methods after they are JITted/loaded.
82          */
83         ji = mono_jit_info_table_find_internal (domain, (char *)ip, FALSE, FALSE);
84         if (!ji)
85                 return FALSE;
86
87         return sgen_is_critical_method (mono_jit_info_get_method (ji));
88 }
89
90 static int
91 restart_threads_until_none_in_managed_allocator (void)
92 {
93         int num_threads_died = 0;
94         int sleep_duration = -1;
95
96         for (;;) {
97                 int restart_count = 0, restarted_count = 0;
98                 /* restart all threads that stopped in the
99                    allocator */
100                 FOREACH_THREAD (info) {
101                         gboolean result;
102                         if (info->client_info.skip || info->client_info.gc_disabled || info->client_info.suspend_done)
103                                 continue;
104                         if (mono_thread_info_is_live (info) &&
105                                         (!info->client_info.stack_start || info->client_info.in_critical_region || info->client_info.info.inside_critical_region ||
106                                         is_ip_in_managed_allocator (info->client_info.stopped_domain, info->client_info.stopped_ip))) {
107                                 binary_protocol_thread_restart ((gpointer)mono_thread_info_get_tid (info));
108                                 SGEN_LOG (3, "thread %p resumed.", (void*) (size_t) info->client_info.info.native_handle);
109                                 result = sgen_resume_thread (info);
110                                 if (result) {
111                                         ++restart_count;
112                                 } else {
113                                         info->client_info.skip = 1;
114                                 }
115                         } else {
116                                 /* we set the stopped_ip to
117                                    NULL for threads which
118                                    we're not restarting so
119                                    that we can easily identify
120                                    the others */
121                                 info->client_info.stopped_ip = NULL;
122                                 info->client_info.stopped_domain = NULL;
123                                 info->client_info.suspend_done = TRUE;
124                         }
125                 } FOREACH_THREAD_END
126                 /* if no threads were restarted, we're done */
127                 if (restart_count == 0)
128                         break;
129
130                 /* wait for the threads to signal their restart */
131                 sgen_wait_for_suspend_ack (restart_count);
132
133                 if (sleep_duration < 0) {
134                         mono_thread_info_yield ();
135                         sleep_duration = 0;
136                 } else {
137                         g_usleep (sleep_duration);
138                         sleep_duration += 10;
139                 }
140
141                 /* stop them again */
142                 FOREACH_THREAD (info) {
143                         gboolean result;
144                         if (info->client_info.skip || info->client_info.stopped_ip == NULL)
145                                 continue;
146                         result = sgen_suspend_thread (info);
147
148                         if (result) {
149                                 ++restarted_count;
150                         } else {
151                                 info->client_info.skip = 1;
152                         }
153                 } FOREACH_THREAD_END
154                 /* some threads might have died */
155                 num_threads_died += restart_count - restarted_count;
156                 /* wait for the threads to signal their suspension
157                    again */
158                 sgen_wait_for_suspend_ack (restarted_count);
159         }
160
161         return num_threads_died;
162 }
163
164 static void
165 acquire_gc_locks (void)
166 {
167         LOCK_INTERRUPTION;
168         mono_thread_info_suspend_lock ();
169 }
170
171 static void
172 release_gc_locks (void)
173 {
174         mono_thread_info_suspend_unlock ();
175         UNLOCK_INTERRUPTION;
176 }
177
178 static TV_DECLARE (stop_world_time);
179 static unsigned long max_pause_usec = 0;
180
181 static guint64 time_stop_world;
182 static guint64 time_restart_world;
183
184 /* LOCKING: assumes the GC lock is held */
185 void
186 sgen_client_stop_world (int generation)
187 {
188         TV_DECLARE (end_handshake);
189
190         /* notify the profiler of the leftovers */
191         /* FIXME this is the wrong spot at we can STW for non collection reasons. */
192         if (G_UNLIKELY (mono_profiler_events & MONO_PROFILE_GC_MOVES))
193                 mono_sgen_gc_event_moves ();
194
195         acquire_gc_locks ();
196
197         /* We start to scan after locks are taking, this ensures we won't be interrupted. */
198         sgen_process_togglerefs ();
199
200         update_current_thread_stack (&generation);
201
202         sgen_global_stop_count++;
203         SGEN_LOG (3, "stopping world n %d from %p %p", sgen_global_stop_count, mono_thread_info_current (), (gpointer) (gsize) mono_native_thread_id_get ());
204         TV_GETTIME (stop_world_time);
205
206         if (mono_thread_info_unified_management_enabled ()) {
207                 sgen_unified_suspend_stop_world ();
208         } else {
209                 int count, dead;
210                 count = sgen_thread_handshake (TRUE);
211                 dead = restart_threads_until_none_in_managed_allocator ();
212                 if (count < dead)
213                         g_error ("More threads have died (%d) that been initialy suspended %d", dead, count);
214         }
215
216         SGEN_LOG (3, "world stopped");
217
218         TV_GETTIME (end_handshake);
219         time_stop_world += TV_ELAPSED (stop_world_time, end_handshake);
220
221         sgen_memgov_collection_start (generation);
222         if (sgen_need_bridge_processing ())
223                 sgen_bridge_reset_data ();
224 }
225
226 /* LOCKING: assumes the GC lock is held */
227 void
228 sgen_client_restart_world (int generation, GGTimingInfo *timing)
229 {
230         TV_DECLARE (end_sw);
231         TV_DECLARE (start_handshake);
232         TV_DECLARE (end_bridge);
233         unsigned long usec, bridge_usec;
234
235         /* notify the profiler of the leftovers */
236         /* FIXME this is the wrong spot at we can STW for non collection reasons. */
237         if (G_UNLIKELY (mono_profiler_events & MONO_PROFILE_GC_MOVES))
238                 mono_sgen_gc_event_moves ();
239
240         FOREACH_THREAD (info) {
241                 info->client_info.stack_start = NULL;
242                 memset (&info->client_info.ctx, 0, sizeof (MonoContext));
243         } FOREACH_THREAD_END
244
245         TV_GETTIME (start_handshake);
246
247         if (mono_thread_info_unified_management_enabled ())
248                 sgen_unified_suspend_restart_world ();
249         else
250                 sgen_thread_handshake (FALSE);
251
252         TV_GETTIME (end_sw);
253         time_restart_world += TV_ELAPSED (start_handshake, end_sw);
254         usec = TV_ELAPSED (stop_world_time, end_sw);
255         max_pause_usec = MAX (usec, max_pause_usec);
256
257         SGEN_LOG (2, "restarted (pause time: %d usec, max: %d)", (int)usec, (int)max_pause_usec);
258
259         /*
260          * We must release the thread info suspend lock after doing
261          * the thread handshake.  Otherwise, if the GC stops the world
262          * and a thread is in the process of starting up, but has not
263          * yet registered (it's not in the thread_list), it is
264          * possible that the thread does register while the world is
265          * stopped.  When restarting the GC will then try to restart
266          * said thread, but since it never got the suspend signal, it
267          * cannot answer the restart signal, so a deadlock results.
268          */
269         release_gc_locks ();
270
271         TV_GETTIME (end_bridge);
272         bridge_usec = TV_ELAPSED (end_sw, end_bridge);
273
274         if (timing) {
275                 timing [0].stw_time = usec;
276                 timing [0].bridge_time = bridge_usec;
277         }
278 }
279
280 void
281 mono_sgen_init_stw (void)
282 {
283         mono_counters_register ("World stop", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_stop_world);
284         mono_counters_register ("World restart", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_restart_world);
285 }
286
287 /* Unified suspend code */
288
289 static gboolean
290 sgen_is_thread_in_current_stw (SgenThreadInfo *info, int *reason)
291 {
292         /*
293         A thread explicitly asked to be skiped because it holds no managed state.
294         This is used by TP and finalizer threads.
295         FIXME Use an atomic variable for this to avoid everyone taking the GC LOCK.
296         */
297         if (info->client_info.gc_disabled) {
298                 if (reason)
299                         *reason = 1;
300                 return FALSE;
301         }
302
303         /*
304         We have detected that this thread is failing/dying, ignore it.
305         FIXME: can't we merge this with thread_is_dying?
306         */
307         if (info->client_info.skip) {
308                 if (reason)
309                         *reason = 2;
310                 return FALSE;
311         }
312
313         /*
314         Suspending the current thread will deadlock us, bad idea.
315         */
316         if (info == mono_thread_info_current ()) {
317                 if (reason)
318                         *reason = 3;
319                 return FALSE;
320         }
321
322         /*
323         We can't suspend the workers that will do all the heavy lifting.
324         FIXME Use some state bit in SgenThreadInfo for this.
325         */
326         if (sgen_thread_pool_is_thread_pool_thread (mono_thread_info_get_tid (info))) {
327                 if (reason)
328                         *reason = 4;
329                 return FALSE;
330         }
331
332         /*
333         The thread has signaled that it started to detach, ignore it.
334         FIXME: can't we merge this with skip
335         */
336         if (!mono_thread_info_is_live (info)) {
337                 if (reason)
338                         *reason = 5;
339                 return FALSE;
340         }
341
342         return TRUE;
343 }
344
345 static void
346 update_sgen_info (SgenThreadInfo *info)
347 {
348         char *stack_start;
349
350         /* Once we remove the old suspend code, we should move sgen to directly access the state in MonoThread */
351         info->client_info.stopped_domain = (MonoDomain *)mono_thread_info_tls_get (info, TLS_KEY_DOMAIN);
352         info->client_info.stopped_ip = (gpointer) MONO_CONTEXT_GET_IP (&mono_thread_info_get_suspend_state (info)->ctx);
353         stack_start = (char*)MONO_CONTEXT_GET_SP (&mono_thread_info_get_suspend_state (info)->ctx) - REDZONE_SIZE;
354
355         /* altstack signal handler, sgen can't handle them, mono-threads should have handled this. */
356         if (stack_start < (char*)info->client_info.stack_start_limit || stack_start >= (char*)info->client_info.stack_end)
357                 g_error ("BAD STACK");
358
359         info->client_info.stack_start = stack_start;
360         g_assert (info->client_info.stack_start);
361         info->client_info.ctx = mono_thread_info_get_suspend_state (info)->ctx;
362 }
363
364 static void
365 sgen_unified_suspend_stop_world (void)
366 {
367         int restart_counter;
368         int sleep_duration = -1;
369
370         mono_threads_begin_global_suspend ();
371         THREADS_STW_DEBUG ("[GC-STW-BEGIN] *** BEGIN SUSPEND *** \n");
372
373         FOREACH_THREAD (info) {
374                 int reason;
375                 info->client_info.skip = FALSE;
376                 info->client_info.suspend_done = FALSE;
377                 if (sgen_is_thread_in_current_stw (info, &reason)) {
378                         info->client_info.skip = !mono_thread_info_begin_suspend (info);
379                         THREADS_STW_DEBUG ("[GC-STW-BEGIN-SUSPEND] SUSPEND thread %p skip %d\n", mono_thread_info_get_tid (info), info->client_info.skip);
380                 } else {
381                         THREADS_STW_DEBUG ("[GC-STW-BEGIN-SUSPEND] IGNORE thread %p skip %d reason %d\n", mono_thread_info_get_tid (info), info->client_info.skip, reason);
382                 }
383         } FOREACH_THREAD_END
384
385         mono_thread_info_current ()->client_info.suspend_done = TRUE;
386         mono_threads_wait_pending_operations ();
387
388         for (;;) {
389                 restart_counter = 0;
390                 FOREACH_THREAD (info) {
391                         int reason = 0;
392                         if (info->client_info.suspend_done || !sgen_is_thread_in_current_stw (info, &reason)) {
393                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] IGNORE RESUME thread %p not been processed done %d current %d reason %d\n", mono_thread_info_get_tid (info), info->client_info.suspend_done, !sgen_is_thread_in_current_stw (info, NULL), reason);
394                                 continue;
395                         }
396
397                         /*
398                         All threads that reach here are pristine suspended. This means the following:
399
400                         - We haven't accepted the previous suspend as good.
401                         - We haven't gave up on it for this STW (it's either bad or asked not to)
402                         */
403                         if (!mono_thread_info_check_suspend_result (info)) {
404                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] SKIP thread %p failed to finish to suspend\n", mono_thread_info_get_tid (info));
405                                 info->client_info.skip = TRUE;
406                         } else if (mono_thread_info_in_critical_location (info)) {
407                                 gboolean res;
408                                 gint suspend_count = mono_thread_info_suspend_count (info);
409                                 if (!(suspend_count == 1))
410                                         g_error ("[%p] suspend_count = %d, but should be 1", mono_thread_info_get_tid (info), suspend_count);
411                                 res = mono_thread_info_begin_resume (info);
412                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] RESTART thread %p skip %d\n", mono_thread_info_get_tid (info), res);
413                                 if (res)
414                                         ++restart_counter;
415                                 else
416                                         info->client_info.skip = TRUE;
417                         } else {
418                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] DONE thread %p deemed fully suspended\n", mono_thread_info_get_tid (info));
419                                 g_assert (!info->client_info.in_critical_region);
420                                 info->client_info.suspend_done = TRUE;
421                         }
422                 } FOREACH_THREAD_END
423
424                 if (restart_counter == 0)
425                         break;
426                 mono_threads_wait_pending_operations ();
427
428                 if (sleep_duration < 0) {
429                         mono_thread_info_yield ();
430                         sleep_duration = 0;
431                 } else {
432                         g_usleep (sleep_duration);
433                         sleep_duration += 10;
434                 }
435
436                 FOREACH_THREAD (info) {
437                         int reason = 0;
438                         if (info->client_info.suspend_done || !sgen_is_thread_in_current_stw (info, &reason)) {
439                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] IGNORE SUSPEND thread %p not been processed done %d current %d reason %d\n", mono_thread_info_get_tid (info), info->client_info.suspend_done, !sgen_is_thread_in_current_stw (info, NULL), reason);
440                                 continue;
441                         }
442
443                         if (mono_thread_info_is_running (info)) {
444                                 gboolean res = mono_thread_info_begin_suspend (info);
445                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] SUSPEND thread %p skip %d\n", mono_thread_info_get_tid (info), res);
446                                 if (!res)
447                                         info->client_info.skip = TRUE;
448                         }
449                 } FOREACH_THREAD_END
450
451                 mono_threads_wait_pending_operations ();
452         }
453
454         FOREACH_THREAD (info) {
455                 int reason = 0;
456                 if (sgen_is_thread_in_current_stw (info, &reason)) {
457                         THREADS_STW_DEBUG ("[GC-STW-SUSPEND-END] thread %p is suspended\n", mono_thread_info_get_tid (info));
458                         g_assert (info->client_info.suspend_done);
459                         update_sgen_info (info);
460                 } else {
461                         THREADS_STW_DEBUG ("[GC-STW-SUSPEND-END] thread %p is NOT suspended, reason %d\n", mono_thread_info_get_tid (info), reason);
462                         g_assert (!info->client_info.suspend_done || info == mono_thread_info_current ());
463                 }
464         } FOREACH_THREAD_END
465 }
466
467 static void
468 sgen_unified_suspend_restart_world (void)
469 {
470         THREADS_STW_DEBUG ("[GC-STW-END] *** BEGIN RESUME ***\n");
471         FOREACH_THREAD (info) {
472                 int reason = 0;
473                 if (sgen_is_thread_in_current_stw (info, &reason)) {
474                         g_assert (mono_thread_info_begin_resume (info));
475                         THREADS_STW_DEBUG ("[GC-STW-RESUME-WORLD] RESUME thread %p\n", mono_thread_info_get_tid (info));
476                 } else {
477                         THREADS_STW_DEBUG ("[GC-STW-RESUME-WORLD] IGNORE thread %p, reason %d\n", mono_thread_info_get_tid (info), reason);
478                 }
479         } FOREACH_THREAD_END
480
481         mono_threads_wait_pending_operations ();
482         mono_threads_end_global_suspend ();
483 }
484 #endif