[threads] Use runtime flag to enable cooperative suspend
[mono.git] / mono / metadata / sgen-stw.c
1 /*
2  * sgen-stw.c: Stop the world functionality
3  *
4  * Author:
5  *      Paolo Molaro (lupus@ximian.com)
6  *  Rodrigo Kumpera (kumpera@gmail.com)
7  *
8  * Copyright 2005-2011 Novell, Inc (http://www.novell.com)
9  * Copyright 2011 Xamarin Inc (http://www.xamarin.com)
10  * Copyright 2011 Xamarin, Inc.
11  * Copyright (C) 2012 Xamarin Inc
12  *
13  * This library is free software; you can redistribute it and/or
14  * modify it under the terms of the GNU Library General Public
15  * License 2.0 as published by the Free Software Foundation;
16  *
17  * This library is distributed in the hope that it will be useful,
18  * but WITHOUT ANY WARRANTY; without even the implied warranty of
19  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
20  * Library General Public License for more details.
21  *
22  * You should have received a copy of the GNU Library General Public
23  * License 2.0 along with this library; if not, write to the Free
24  * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25  */
26
27 #include "config.h"
28 #ifdef HAVE_SGEN_GC
29
30 #include "sgen/sgen-gc.h"
31 #include "sgen/sgen-protocol.h"
32 #include "sgen/sgen-memory-governor.h"
33 #include "sgen/sgen-thread-pool.h"
34 #include "metadata/profiler-private.h"
35 #include "sgen/sgen-client.h"
36 #include "metadata/sgen-bridge-internal.h"
37 #include "metadata/gc-internal.h"
38
/* Short local spellings of the sgen timing macros used throughout this file. */
#define TV_DECLARE SGEN_TV_DECLARE
#define TV_GETTIME SGEN_TV_GETTIME
#define TV_ELAPSED SGEN_TV_ELAPSED

/* Unified-suspend implementations of stop/restart; defined later in this file. */
static void sgen_unified_suspend_stop_world (void);
static void sgen_unified_suspend_restart_world (void);

/* Bumped once by sgen_client_stop_world () every time the world is stopped. */
unsigned int sgen_global_stop_count = 0;
47
/*
 * Round PTR up to the nearest address that is a multiple of
 * sizeof (gpointer).  A pointer that is already aligned is returned
 * unchanged.
 */
inline static void*
align_pointer (void *ptr)
{
	const mword align_mask = sizeof (gpointer) - 1;

	return (void*) (((mword)ptr + align_mask) & ~align_mask);
}
56
57 #ifdef USE_MONO_CTX
58 static MonoContext cur_thread_ctx;
59 #else
60 static mword cur_thread_regs [ARCH_NUM_REGS];
61 #endif
62
63 static void
64 update_current_thread_stack (void *start)
65 {
66         int stack_guard = 0;
67 #if !defined(USE_MONO_CTX)
68         void *reg_ptr = cur_thread_regs;
69 #endif
70         SgenThreadInfo *info = mono_thread_info_current ();
71         
72         info->client_info.stack_start = align_pointer (&stack_guard);
73         g_assert (info->client_info.stack_start >= info->client_info.stack_start_limit && info->client_info.stack_start < info->client_info.stack_end);
74 #ifdef USE_MONO_CTX
75         MONO_CONTEXT_GET_CURRENT (cur_thread_ctx);
76         memcpy (&info->client_info.ctx, &cur_thread_ctx, sizeof (MonoContext));
77         if (mono_gc_get_gc_callbacks ()->thread_suspend_func)
78                 mono_gc_get_gc_callbacks ()->thread_suspend_func (info->client_info.runtime_data, NULL, &info->client_info.ctx);
79 #else
80         ARCH_STORE_REGS (reg_ptr);
81         memcpy (&info->client_info.regs, reg_ptr, sizeof (info->client_info.regs));
82         if (mono_gc_get_gc_callbacks ()->thread_suspend_func)
83                 mono_gc_get_gc_callbacks ()->thread_suspend_func (info->client_info.runtime_data, NULL, NULL);
84 #endif
85 }
86
87 static gboolean
88 is_ip_in_managed_allocator (MonoDomain *domain, gpointer ip)
89 {
90         MonoJitInfo *ji;
91
92         if (!mono_thread_internal_current ())
93                 /* Happens during thread attach */
94                 return FALSE;
95
96         if (!ip || !domain)
97                 return FALSE;
98         if (!sgen_has_critical_method ())
99                 return FALSE;
100
101         /*
102          * mono_jit_info_table_find is not async safe since it calls into the AOT runtime to load information for
103          * missing methods (#13951). To work around this, we disable the AOT fallback. For this to work, the JIT needs
104          * to register the jit info for all GC critical methods after they are JITted/loaded.
105          */
106         ji = mono_jit_info_table_find_internal (domain, ip, FALSE, FALSE);
107         if (!ji)
108                 return FALSE;
109
110         return sgen_is_critical_method (mono_jit_info_get_method (ji));
111 }
112
113 static int
114 restart_threads_until_none_in_managed_allocator (void)
115 {
116         SgenThreadInfo *info;
117         int num_threads_died = 0;
118         int sleep_duration = -1;
119
120         for (;;) {
121                 int restart_count = 0, restarted_count = 0;
122                 /* restart all threads that stopped in the
123                    allocator */
124                 FOREACH_THREAD_SAFE (info) {
125                         gboolean result;
126                         if (info->client_info.skip || info->client_info.gc_disabled || info->client_info.suspend_done)
127                                 continue;
128                         if (mono_thread_info_is_live (info) &&
129                                         (!info->client_info.stack_start || info->client_info.in_critical_region || info->client_info.info.inside_critical_region ||
130                                         is_ip_in_managed_allocator (info->client_info.stopped_domain, info->client_info.stopped_ip))) {
131                                 binary_protocol_thread_restart ((gpointer)mono_thread_info_get_tid (info));
132                                 SGEN_LOG (3, "thread %p resumed.", (void*) (size_t) info->client_info.info.native_handle);
133                                 result = sgen_resume_thread (info);
134                                 if (result) {
135                                         ++restart_count;
136                                 } else {
137                                         info->client_info.skip = 1;
138                                 }
139                         } else {
140                                 /* we set the stopped_ip to
141                                    NULL for threads which
142                                    we're not restarting so
143                                    that we can easily identify
144                                    the others */
145                                 info->client_info.stopped_ip = NULL;
146                                 info->client_info.stopped_domain = NULL;
147                                 info->client_info.suspend_done = TRUE;
148                         }
149                 } END_FOREACH_THREAD_SAFE
150                 /* if no threads were restarted, we're done */
151                 if (restart_count == 0)
152                         break;
153
154                 /* wait for the threads to signal their restart */
155                 sgen_wait_for_suspend_ack (restart_count);
156
157                 if (sleep_duration < 0) {
158                         mono_thread_info_yield ();
159                         sleep_duration = 0;
160                 } else {
161                         g_usleep (sleep_duration);
162                         sleep_duration += 10;
163                 }
164
165                 /* stop them again */
166                 FOREACH_THREAD (info) {
167                         gboolean result;
168                         if (info->client_info.skip || info->client_info.stopped_ip == NULL)
169                                 continue;
170                         result = sgen_suspend_thread (info);
171
172                         if (result) {
173                                 ++restarted_count;
174                         } else {
175                                 info->client_info.skip = 1;
176                         }
177                 } END_FOREACH_THREAD
178                 /* some threads might have died */
179                 num_threads_died += restart_count - restarted_count;
180                 /* wait for the threads to signal their suspension
181                    again */
182                 sgen_wait_for_suspend_ack (restarted_count);
183         }
184
185         return num_threads_died;
186 }
187
188 static void
189 acquire_gc_locks (void)
190 {
191         LOCK_INTERRUPTION;
192         mono_thread_info_suspend_lock ();
193 }
194
195 static void
196 release_gc_locks (void)
197 {
198         mono_thread_info_suspend_unlock ();
199         UNLOCK_INTERRUPTION;
200 }
201
202 static TV_DECLARE (stop_world_time);
203 static unsigned long max_pause_usec = 0;
204
205 static guint64 time_stop_world;
206 static guint64 time_restart_world;
207
208 /* LOCKING: assumes the GC lock is held */
209 void
210 sgen_client_stop_world (int generation)
211 {
212         TV_DECLARE (end_handshake);
213
214         /* notify the profiler of the leftovers */
215         /* FIXME this is the wrong spot at we can STW for non collection reasons. */
216         if (G_UNLIKELY (mono_profiler_events & MONO_PROFILE_GC_MOVES))
217                 mono_sgen_gc_event_moves ();
218
219         acquire_gc_locks ();
220
221         /* We start to scan after locks are taking, this ensures we won't be interrupted. */
222         sgen_process_togglerefs ();
223
224         update_current_thread_stack (&generation);
225
226         sgen_global_stop_count++;
227         SGEN_LOG (3, "stopping world n %d from %p %p", sgen_global_stop_count, mono_thread_info_current (), (gpointer)mono_native_thread_id_get ());
228         TV_GETTIME (stop_world_time);
229
230         if (mono_thread_info_unified_management_enabled ()) {
231                 sgen_unified_suspend_stop_world ();
232         } else {
233                 int count, dead;
234                 count = sgen_thread_handshake (TRUE);
235                 dead = restart_threads_until_none_in_managed_allocator ();
236                 if (count < dead)
237                         g_error ("More threads have died (%d) that been initialy suspended %d", dead, count);
238         }
239
240         SGEN_LOG (3, "world stopped");
241
242         TV_GETTIME (end_handshake);
243         time_stop_world += TV_ELAPSED (stop_world_time, end_handshake);
244
245         sgen_memgov_collection_start (generation);
246         if (sgen_need_bridge_processing ())
247                 sgen_bridge_reset_data ();
248 }
249
250 /* LOCKING: assumes the GC lock is held */
251 void
252 sgen_client_restart_world (int generation, GGTimingInfo *timing)
253 {
254         SgenThreadInfo *info;
255         TV_DECLARE (end_sw);
256         TV_DECLARE (start_handshake);
257         TV_DECLARE (end_bridge);
258         unsigned long usec, bridge_usec;
259
260         /* notify the profiler of the leftovers */
261         /* FIXME this is the wrong spot at we can STW for non collection reasons. */
262         if (G_UNLIKELY (mono_profiler_events & MONO_PROFILE_GC_MOVES))
263                 mono_sgen_gc_event_moves ();
264
265         FOREACH_THREAD (info) {
266                 info->client_info.stack_start = NULL;
267 #ifdef USE_MONO_CTX
268                 memset (&info->client_info.ctx, 0, sizeof (MonoContext));
269 #else
270                 memset (&info->client_info.regs, 0, sizeof (info->client_info.regs));
271 #endif
272         } END_FOREACH_THREAD
273
274         TV_GETTIME (start_handshake);
275
276         if (mono_thread_info_unified_management_enabled ())
277                 sgen_unified_suspend_restart_world ();
278         else
279                 sgen_thread_handshake (FALSE);
280
281         TV_GETTIME (end_sw);
282         time_restart_world += TV_ELAPSED (start_handshake, end_sw);
283         usec = TV_ELAPSED (stop_world_time, end_sw);
284         max_pause_usec = MAX (usec, max_pause_usec);
285
286         SGEN_LOG (2, "restarted (pause time: %d usec, max: %d)", (int)usec, (int)max_pause_usec);
287
288         /*
289          * We must release the thread info suspend lock after doing
290          * the thread handshake.  Otherwise, if the GC stops the world
291          * and a thread is in the process of starting up, but has not
292          * yet registered (it's not in the thread_list), it is
293          * possible that the thread does register while the world is
294          * stopped.  When restarting the GC will then try to restart
295          * said thread, but since it never got the suspend signal, it
296          * cannot answer the restart signal, so a deadlock results.
297          */
298         release_gc_locks ();
299
300         TV_GETTIME (end_bridge);
301         bridge_usec = TV_ELAPSED (end_sw, end_bridge);
302
303         if (timing) {
304                 timing [0].stw_time = usec;
305                 timing [0].bridge_time = bridge_usec;
306         }
307 }
308
309 void
310 mono_sgen_init_stw (void)
311 {
312         mono_counters_register ("World stop", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_stop_world);
313         mono_counters_register ("World restart", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_restart_world);
314 }
315
316 /* Unified suspend code */
317
318 static gboolean
319 sgen_is_thread_in_current_stw (SgenThreadInfo *info)
320 {
321         /*
322         A thread explicitly asked to be skiped because it holds no managed state.
323         This is used by TP and finalizer threads.
324         FIXME Use an atomic variable for this to avoid everyone taking the GC LOCK.
325         */
326         if (info->client_info.gc_disabled) {
327                 return FALSE;
328         }
329
330         /*
331         We have detected that this thread is failing/dying, ignore it.
332         FIXME: can't we merge this with thread_is_dying?
333         */
334         if (info->client_info.skip) {
335                 return FALSE;
336         }
337
338         /*
339         Suspending the current thread will deadlock us, bad idea.
340         */
341         if (info == mono_thread_info_current ()) {
342                 return FALSE;
343         }
344
345         /*
346         We can't suspend the workers that will do all the heavy lifting.
347         FIXME Use some state bit in SgenThreadInfo for this.
348         */
349         if (sgen_thread_pool_is_thread_pool_thread (mono_thread_info_get_tid (info))) {
350                 return FALSE;
351         }
352
353         /*
354         The thread has signaled that it started to detach, ignore it.
355         FIXME: can't we merge this with skip
356         */
357         if (!mono_thread_info_is_live (info)) {
358                 return FALSE;
359         }
360
361         return TRUE;
362 }
363
364 static void
365 update_sgen_info (SgenThreadInfo *info)
366 {
367         char *stack_start;
368
369         /* Once we remove the old suspend code, we should move sgen to directly access the state in MonoThread */
370         info->client_info.stopped_domain = mono_thread_info_tls_get (info, TLS_KEY_DOMAIN);
371         info->client_info.stopped_ip = (gpointer) MONO_CONTEXT_GET_IP (&mono_thread_info_get_suspend_state (info)->ctx);
372         stack_start = (char*)MONO_CONTEXT_GET_SP (&mono_thread_info_get_suspend_state (info)->ctx) - REDZONE_SIZE;
373
374         /* altstack signal handler, sgen can't handle them, mono-threads should have handled this. */
375         if (stack_start < (char*)info->client_info.stack_start_limit || stack_start >= (char*)info->client_info.stack_end)
376                 g_error ("BAD STACK");
377
378         info->client_info.stack_start = stack_start;
379 #ifdef USE_MONO_CTX
380         info->client_info.ctx = mono_thread_info_get_suspend_state (info)->ctx;
381 #else
382         g_assert_not_reached ();
383 #endif
384 }
385
386 static void
387 sgen_unified_suspend_stop_world (void)
388 {
389         int restart_counter;
390         SgenThreadInfo *info;
391         int sleep_duration = -1;
392
393         mono_threads_begin_global_suspend ();
394         THREADS_STW_DEBUG ("[GC-STW-BEGIN] *** BEGIN SUSPEND *** \n");
395
396         FOREACH_THREAD_SAFE (info) {
397                 info->client_info.skip = FALSE;
398                 info->client_info.suspend_done = FALSE;
399                 if (sgen_is_thread_in_current_stw (info)) {
400                         info->client_info.skip = !mono_thread_info_begin_suspend (info, FALSE);
401                         THREADS_STW_DEBUG ("[GC-STW-BEGIN-SUSPEND] SUSPEND thread %p skip %d\n", mono_thread_info_get_tid (info), info->client_info.skip);
402                 } else {
403                         THREADS_STW_DEBUG ("[GC-STW-BEGIN-SUSPEND] IGNORE thread %p skip %d\n", mono_thread_info_get_tid (info), info->client_info.skip);
404                 }
405         } END_FOREACH_THREAD_SAFE
406
407         mono_thread_info_current ()->client_info.suspend_done = TRUE;
408         mono_threads_wait_pending_operations ();
409
410         for (;;) {
411                 restart_counter = 0;
412                 FOREACH_THREAD_SAFE (info) {
413                         if (info->client_info.suspend_done || !sgen_is_thread_in_current_stw (info)) {
414                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] IGNORE thread %p not been processed done %d current %d\n", mono_thread_info_get_tid (info), info->client_info.suspend_done, !sgen_is_thread_in_current_stw (info));
415                                 continue;
416                         }
417
418                         /*
419                         All threads that reach here are pristine suspended. This means the following:
420
421                         - We haven't accepted the previous suspend as good.
422                         - We haven't gave up on it for this STW (it's either bad or asked not to)
423                         */
424                         if (!mono_thread_info_check_suspend_result (info)) {
425                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] SKIP thread %p failed to finish to suspend\n", mono_thread_info_get_tid (info));
426                                 info->client_info.skip = TRUE;
427                         } else if (mono_thread_info_in_critical_location (info)) {
428                                 gboolean res;
429                                 g_assert (mono_thread_info_suspend_count (info) == 1);
430                                 res = mono_thread_info_begin_resume (info);
431                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] RESTART thread %p skip %d\n", mono_thread_info_get_tid (info), res);
432                                 if (res)
433                                         ++restart_counter;
434                                 else
435                                         info->client_info.skip = TRUE;
436                         } else {
437                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] DONE thread %p deemed fully suspended\n", mono_thread_info_get_tid (info));
438                                 g_assert (!info->client_info.in_critical_region);
439                                 info->client_info.suspend_done = TRUE;
440                         }
441                 } END_FOREACH_THREAD_SAFE
442
443                 if (restart_counter == 0)
444                         break;
445                 mono_threads_wait_pending_operations ();
446
447                 if (sleep_duration < 0) {
448 #ifdef HOST_WIN32
449                         SwitchToThread ();
450 #else
451                         sched_yield ();
452 #endif
453                         sleep_duration = 0;
454                 } else {
455                         g_usleep (sleep_duration);
456                         sleep_duration += 10;
457                 }
458
459                 FOREACH_THREAD_SAFE (info) {
460                         if (sgen_is_thread_in_current_stw (info) && mono_thread_info_is_running (info)) {
461                                 gboolean res = mono_thread_info_begin_suspend (info, FALSE);
462                                 THREADS_STW_DEBUG ("[GC-STW-RESTART] SUSPEND thread %p skip %d\n", mono_thread_info_get_tid (info), res);
463                                 if (!res)
464                                         info->client_info.skip = TRUE;
465                         }
466                 } END_FOREACH_THREAD_SAFE
467
468                 mono_threads_wait_pending_operations ();
469         }
470
471         FOREACH_THREAD_SAFE (info) {
472                 if (sgen_is_thread_in_current_stw (info)) {
473                         THREADS_STW_DEBUG ("[GC-STW-SUSPEND-END] thread %p is suspended\n", mono_thread_info_get_tid (info));
474                         g_assert (info->client_info.suspend_done);
475                         update_sgen_info (info);
476                 } else {
477                         g_assert (!info->client_info.suspend_done || info == mono_thread_info_current ());
478                 }
479         } END_FOREACH_THREAD_SAFE
480 }
481
482 static void
483 sgen_unified_suspend_restart_world (void)
484 {
485         SgenThreadInfo *info;
486
487         THREADS_STW_DEBUG ("[GC-STW-END] *** BEGIN RESUME ***\n");
488         FOREACH_THREAD_SAFE (info) {
489                 if (sgen_is_thread_in_current_stw (info)) {
490                         g_assert (mono_thread_info_begin_resume (info));
491                         THREADS_STW_DEBUG ("[GC-STW-RESUME-WORLD] RESUME thread %p\n", mono_thread_info_get_tid (info));
492                 } else {
493                         THREADS_STW_DEBUG ("[GC-STW-RESUME-WORLD] IGNORE thread %p\n", mono_thread_info_get_tid (info));
494                 }
495         } END_FOREACH_THREAD_SAFE
496
497         mono_threads_wait_pending_operations ();
498         mono_threads_end_global_suspend ();
499 }
500 #endif