[sgen] Remove an incorrect assertion in the workers code and explain.
[mono.git] / mono / metadata / sgen-gc.c
index 020cb5eb1c4de7ae22e46d51c79efe83fe99ec0f..0f389b2a8fe2a0ec96b355d90bd919354cdca8a9 100644 (file)
 #include "metadata/sgen-cardtable.h"
 #include "metadata/sgen-protocol.h"
 #include "metadata/sgen-archdep.h"
+#include "metadata/sgen-bridge.h"
 #include "metadata/mono-gc.h"
 #include "metadata/method-builder.h"
 #include "metadata/profiler-private.h"
@@ -745,6 +746,10 @@ static int moved_objects_idx = 0;
 /* Vtable of the objects used to fill out nursery fragments before a collection */
 static MonoVTable *array_fill_vtable;
 
+#ifdef SGEN_DEBUG_INTERNAL_ALLOC
+pthread_t main_gc_thread = NULL;
+#endif
+
 /*
  * ######################################################################
  * ########  Heap size accounting
@@ -825,7 +830,7 @@ static void find_pinning_ref_from_thread (char *obj, size_t size);
 static void update_current_thread_stack (void *start);
 static void finalize_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, GrayQueue *queue);
 static void add_or_remove_disappearing_link (MonoObject *obj, void **link, gboolean track, int generation);
-static void null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, GrayQueue *queue);
+static void null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, gboolean before_finalization, GrayQueue *queue);
 static void null_links_for_domain (MonoDomain *domain, int generation);
 static gboolean search_fragment_for_size (size_t size);
 static int search_fragment_for_size_range (size_t desired_size, size_t minimum_size);
@@ -836,7 +841,7 @@ static void optimize_pin_queue (int start_slot);
 static void clear_remsets (void);
 static void clear_tlabs (void);
 static void sort_addresses (void **array, int size);
-static void drain_gray_stack (GrayQueue *queue);
+static gboolean drain_gray_stack (GrayQueue *queue, int max_objs);
 static void finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *queue);
 static gboolean need_major_collection (mword space_needed);
 static void major_collection (const char *reason);
@@ -1697,8 +1702,8 @@ mono_sgen_add_to_global_remset (gpointer ptr)
  * frequently after each object is copied, to achieve better locality and cache
  * usage.
  */
-static void
-drain_gray_stack (GrayQueue *queue)
+static gboolean
+drain_gray_stack (GrayQueue *queue, int max_objs)
 {
        char *obj;
 
@@ -1706,21 +1711,26 @@ drain_gray_stack (GrayQueue *queue)
                for (;;) {
                        GRAY_OBJECT_DEQUEUE (queue, obj);
                        if (!obj)
-                               break;
+                               return TRUE;
                        DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", obj, safe_name (obj)));
                        major_collector.minor_scan_object (obj, queue);
                }
        } else {
+               int i;
+
                if (major_collector.is_parallel && queue == &workers_distribute_gray_queue)
-                       return;
+                       return TRUE;
 
-               for (;;) {
-                       GRAY_OBJECT_DEQUEUE (queue, obj);
-                       if (!obj)
-                               break;
-                       DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", obj, safe_name (obj)));
-                       major_collector.major_scan_object (obj, queue);
-               }
+               do {
+                       for (i = 0; i != max_objs; ++i) {
+                               GRAY_OBJECT_DEQUEUE (queue, obj);
+                               if (!obj)
+                                       return TRUE;
+                               DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", obj, safe_name (obj)));
+                               major_collector.major_scan_object (obj, queue);
+                       }
+               } while (max_objs < 0);
+               return FALSE;
        }
 }
 
@@ -2098,7 +2108,7 @@ precisely_scan_objects_from (CopyOrMarkObjectFunc copy_func, void** start_root,
                        if ((desc & 1) && *start_root) {
                                copy_func (start_root, queue);
                                DEBUG (9, fprintf (gc_debug_file, "Overwrote root at %p with %p\n", start_root, *start_root));
-                               drain_gray_stack (queue);
+                               drain_gray_stack (queue, -1);
                        }
                        desc >>= 1;
                        start_root++;
@@ -2116,7 +2126,7 @@ precisely_scan_objects_from (CopyOrMarkObjectFunc copy_func, void** start_root,
                                if ((bmap & 1) && *objptr) {
                                        copy_func (objptr, queue);
                                        DEBUG (9, fprintf (gc_debug_file, "Overwrote root at %p with %p\n", objptr, *objptr));
-                                       drain_gray_stack (queue);
+                                       drain_gray_stack (queue, -1);
                                }
                                bmap >>= 1;
                                ++objptr;
@@ -2455,6 +2465,28 @@ get_finalize_entry_hash_table (int generation)
        }
 }
 
+static MonoObject **finalized_array = NULL;
+static int finalized_array_capacity = 0;
+static int finalized_array_entries = 0;
+
+static void /* Appends object to finalized_array, doubling the array's capacity when it is full. */
+bridge_register_finalized_object (MonoObject *object)
+{
+       if (!finalized_array) /* array not allocated (finish_gray_stack only allocates it when bridge processing is needed): record nothing */
+               return;
+
+       if (finalized_array_entries >= finalized_array_capacity) { /* full: grow before appending */
+               MonoObject **new_array;
+               g_assert (finalized_array_entries == finalized_array_capacity); /* the entry count must never have overshot the capacity */
+               finalized_array_capacity *= 2;
+               new_array = mono_sgen_alloc_internal_dynamic (sizeof (MonoObject*) * finalized_array_capacity, INTERNAL_MEM_BRIDGE_DATA);
+               memcpy (new_array, finalized_array, sizeof (MonoObject*) * finalized_array_entries);
+               mono_sgen_free_internal_dynamic (finalized_array, sizeof (MonoObject*) * finalized_array_entries, INTERNAL_MEM_BRIDGE_DATA); /* entries equals the old capacity here (asserted above), so this is the old block's size */
+               finalized_array = new_array;
+       }
+       finalized_array [finalized_array_entries++] = object;
+}
+
 static void
 finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *queue)
 {
@@ -2462,6 +2494,7 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
        TV_DECLARE (btv);
        int fin_ready;
        int ephemeron_rounds = 0;
+       int num_loops;
        CopyOrMarkObjectFunc copy_func = current_collection_generation == GENERATION_NURSERY ? major_collector.copy_object : major_collector.copy_or_mark_object;
 
        /*
@@ -2477,9 +2510,24 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
         *   To achieve better cache locality and cache usage, we drain the gray stack 
         * frequently, after each object is copied, and just finish the work here.
         */
-       drain_gray_stack (queue);
+       drain_gray_stack (queue, -1);
        TV_GETTIME (atv);
        DEBUG (2, fprintf (gc_debug_file, "%s generation done\n", generation_name (generation)));
+
+       /*
+       We must clear weak links that don't track resurrection before processing objects ready for
+       finalization, so that those links are already nulled when finalization handling runs.
+       */
+       null_link_in_range (copy_func, start_addr, end_addr, generation, TRUE, queue);
+       if (generation == GENERATION_OLD)
+               null_link_in_range (copy_func, start_addr, end_addr, GENERATION_NURSERY, TRUE, queue);
+
+       if (finalized_array == NULL && mono_sgen_need_bridge_processing ()) {
+               finalized_array_capacity = 32;
+               finalized_array = mono_sgen_alloc_internal_dynamic (sizeof (MonoObject*) * finalized_array_capacity, INTERNAL_MEM_BRIDGE_DATA);
+       }
+       finalized_array_entries = 0;
+
        /* walk the finalization queue and move also the objects that need to be
         * finalized: use the finalized objects as new roots so the objects they depend
         * on are also not reclaimed. As with the roots above, only objects in the nursery
@@ -2487,6 +2535,7 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
         * We need a loop here, since objects ready for finalizers may reference other objects
         * that are fin-ready. Speedup with a flag?
         */
+       num_loops = 0;
        do {
                /*
                 * Walk the ephemeron tables marking all values with reachable keys. This must be completely done
@@ -2498,7 +2547,7 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
                int done_with_ephemerons = 0;
                do {
                        done_with_ephemerons = mark_ephemerons_in_range (copy_func, start_addr, end_addr, queue);
-                       drain_gray_stack (queue);
+                       drain_gray_stack (queue, -1);
                        ++ephemeron_rounds;
                } while (!done_with_ephemerons);
 
@@ -2507,11 +2556,20 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
                if (generation == GENERATION_OLD)
                        finalize_in_range (copy_func, nursery_start, nursery_real_end, GENERATION_NURSERY, queue);
 
+               if (fin_ready != num_ready_finalizers) {
+                       ++num_loops;
+                       if (finalized_array != NULL)
+                               mono_sgen_bridge_processing (finalized_array_entries, finalized_array);
+               }
+
                /* drain the new stack that might have been created */
                DEBUG (6, fprintf (gc_debug_file, "Precise scan of gray area post fin\n"));
-               drain_gray_stack (queue);
+               drain_gray_stack (queue, -1);
        } while (fin_ready != num_ready_finalizers);
 
+       if (mono_sgen_need_bridge_processing ())
+               g_assert (num_loops <= 1);
+
        /*
         * Clear ephemeron pairs with unreachable keys.
         * We pass the copy func so we can figure out if an array was promoted or not.
@@ -2531,12 +2589,12 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
         */
        g_assert (gray_object_queue_is_empty (queue));
        for (;;) {
-               null_link_in_range (copy_func, start_addr, end_addr, generation, queue);
+               null_link_in_range (copy_func, start_addr, end_addr, generation, FALSE, queue);
                if (generation == GENERATION_OLD)
-                       null_link_in_range (copy_func, start_addr, end_addr, GENERATION_NURSERY, queue);
+                       null_link_in_range (copy_func, start_addr, end_addr, GENERATION_NURSERY, FALSE, queue);
                if (gray_object_queue_is_empty (queue))
                        break;
-               drain_gray_stack (queue);
+               drain_gray_stack (queue, -1);
        }
 
        g_assert (gray_object_queue_is_empty (queue));
@@ -3017,7 +3075,7 @@ collect_nursery (size_t requested_size)
                time_minor_scan_card_table += TV_ELAPSED_MS (atv, btv);
        }
 
-       drain_gray_stack (&gray_queue);
+       drain_gray_stack (&gray_queue, -1);
 
        if (mono_profiler_get_events () & MONO_PROFILE_GC_ROOTS)
                report_registered_roots ();
@@ -3129,8 +3187,7 @@ major_do_collection (const char *reason)
        binary_protocol_collection (GENERATION_OLD);
        check_scan_starts ();
        gray_object_queue_init (&gray_queue, mono_sgen_get_unmanaged_allocator ());
-       if (major_collector.is_parallel)
-               gray_object_queue_init (&workers_distribute_gray_queue, mono_sgen_get_unmanaged_allocator ());
+       workers_init_distribute_gray_queue ();
 
        degraded_mode = 0;
        DEBUG (1, fprintf (gc_debug_file, "Start major collection %d\n", num_major_gcs));
@@ -3214,7 +3271,12 @@ major_do_collection (const char *reason)
 
        major_collector.init_to_space ();
 
-       workers_start_all_workers (1);
+#ifdef SGEN_DEBUG_INTERNAL_ALLOC
+       main_gc_thread = pthread_self ();
+#endif
+
+       workers_start_all_workers ();
+       workers_start_marking ();
 
        if (mono_profiler_get_events () & MONO_PROFILE_GC_ROOTS)
                report_registered_roots ();
@@ -3252,12 +3314,15 @@ major_do_collection (const char *reason)
        if (major_collector.is_parallel) {
                while (!gray_object_queue_is_empty (WORKERS_DISTRIBUTE_GRAY_QUEUE)) {
                        workers_distribute_gray_queue_sections ();
-                       usleep (2000);
+                       usleep (1000);
                }
        }
-       workers_change_num_working (-1);
        workers_join ();
 
+#ifdef SGEN_DEBUG_INTERNAL_ALLOC
+       main_gc_thread = NULL;
+#endif
+
        if (major_collector.is_parallel)
                g_assert (gray_object_queue_is_empty (&gray_queue));
 
@@ -3266,6 +3331,15 @@ major_do_collection (const char *reason)
        TV_GETTIME (atv);
        time_major_finish_gray_stack += TV_ELAPSED_MS (btv, atv);
 
+       /*
+        * The (single-threaded) finalization code might have done
+        * some copying/marking so we can only reset the GC thread's
+        * worker data here instead of earlier when we joined the
+        * workers.
+        */
+       if (major_collector.reset_worker_data)
+               major_collector.reset_worker_data (workers_gc_thread_data.major_collector_data);
+
        if (objects_pinned) {
                /*This is slow, but we just OOM'd*/
                mono_sgen_pin_queue_clear_discarded_entries (nursery_section, old_next_pin_slot);
@@ -4073,6 +4147,7 @@ finalize_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int g
                                        num_ready_finalizers++;
                                        hash_table->num_registered--;
                                        queue_finalization_entry (entry);
+                                       bridge_register_finalized_object ((MonoObject*)copy);
                                        /* Make it survive */
                                        from = entry->object;
                                        entry->object = copy;
@@ -4127,6 +4202,16 @@ object_is_reachable (char *object, char *start, char *end)
        return !object_is_fin_ready (object) || major_collector.is_object_live (object);
 }
 
+gboolean /* Reports whether obj is considered live at this point of the current collection. */
+mono_sgen_object_is_live (void *obj)
+{
+       if (ptr_in_nursery (obj))
+               return object_is_pinned (obj); /* a nursery object is reported live only if it is pinned */
+       if (current_collection_generation == GENERATION_NURSERY)
+               return FALSE; /* NOTE(review): objects outside the nursery are reported not live during a nursery collection -- confirm callers expect this */
+       return major_collector.is_object_live (obj); /* otherwise the major collector decides */
+}
+
 /* LOCKING: requires that the GC lock is held */
 static void
 null_ephemerons_for_domain (MonoDomain *domain)
@@ -4288,7 +4373,7 @@ mark_ephemerons_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end
 
 /* LOCKING: requires that the GC lock is held */
 static void
-null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, GrayQueue *queue)
+null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, gboolean before_finalization, GrayQueue *queue)
 {
        DisappearingLinkHashTable *hash = get_dislink_hash_table (generation);
        DisappearingLink **disappearing_link_hash = hash->table;
@@ -4300,10 +4385,28 @@ null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int
        for (i = 0; i < disappearing_link_hash_size; ++i) {
                prev = NULL;
                for (entry = disappearing_link_hash [i]; entry;) {
-                       char *object = DISLINK_OBJECT (entry);
+                       char *object;
+                       gboolean track = DISLINK_TRACK (entry);
+
+                       /*
+                        * Tracked references are processed after
+                        * finalization handling whereas standard weak
+                        * references are processed before.  If an
+                        * object is still not marked after finalization
+                        * handling it means that it either doesn't have
+                        * a finalizer or the finalizer has already run,
+                        * so we must null a tracking reference.
+                        */
+                       if (track == before_finalization) {
+                               prev = entry;
+                               entry = entry->next;
+                               continue;
+                       }
+
+                       object = DISLINK_OBJECT (entry);
+
                        if (object >= start && object < end && !major_collector.is_object_live (object)) {
-                               gboolean track = DISLINK_TRACK (entry);
-                               if (!track && object_is_fin_ready (object)) {
+                               if (object_is_fin_ready (object)) {
                                        void **p = entry->link;
                                        DisappearingLink *old;
                                        *p = NULL;
@@ -4350,14 +4453,7 @@ null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int
 
                                                continue;
                                        } else {
-                                               /* We set the track resurrection bit to
-                                                * FALSE if the object is to be finalized
-                                                * so that the object can be collected in
-                                                * the next cycle (i.e. after it was
-                                                * finalized).
-                                                */
-                                               *entry->link = HIDE_POINTER (copy,
-                                                       object_is_fin_ready (object) ? FALSE : track);
+                                               *entry->link = HIDE_POINTER (copy, track);
                                                DEBUG (5, fprintf (gc_debug_file, "Updated dislink at %p to %p\n", entry->link, DISLINK_OBJECT (entry)));
                                        }
                                }
@@ -4858,7 +4954,7 @@ mono_gc_deregister_root (char* addr)
 /* FIXME: handle large/small config */
 #define HASH_PTHREAD_T(id) (((unsigned int)(id) >> 4) * 2654435761u)
 
-static SgenThreadInfo* thread_table [THREAD_HASH_SIZE];
+SgenThreadInfo* thread_table [THREAD_HASH_SIZE];
 
 #if USE_SIGNAL_BASED_START_STOP_WORLD
 
@@ -4867,14 +4963,13 @@ static MonoSemType *suspend_ack_semaphore_ptr;
 static unsigned int global_stop_count = 0;
 
 static sigset_t suspend_signal_mask;
+
+#ifdef USE_MONO_CTX
+static MonoContext cur_thread_ctx = {0};
+#else
 static mword cur_thread_regs [ARCH_NUM_REGS] = {0};
+#endif
 
-/* LOCKING: assumes the GC lock is held */
-SgenThreadInfo**
-mono_sgen_get_thread_table (void)
-{
-       return thread_table;
-}
 
 SgenThreadInfo*
 mono_sgen_thread_info_lookup (ARCH_THREAD_TYPE id)
@@ -4892,13 +4987,23 @@ mono_sgen_thread_info_lookup (ARCH_THREAD_TYPE id)
 static void
 update_current_thread_stack (void *start)
 {
+       int stack_guard = 0; /* local whose address is used below as this thread's current stack position */
+#ifdef USE_MONO_CTX
+       void *ptr = NULL; /* not used on this path; the context goes into cur_thread_ctx instead */
+#else
         void *ptr = cur_thread_regs;
+#endif
         SgenThreadInfo *info = mono_sgen_thread_info_lookup (ARCH_GET_THREAD ());
         
-       info->stack_start = align_pointer (&ptr);
+       info->stack_start = align_pointer (&stack_guard);
         g_assert (info->stack_start >= info->stack_start_limit && info->stack_start < info->stack_end);
+#ifdef USE_MONO_CTX
+       MONO_CONTEXT_GET_CURRENT (cur_thread_ctx); /* presumably fills cur_thread_ctx with the current registers -- confirm against the macro */
+       info->monoctx = &cur_thread_ctx; /* cur_thread_ctx is a file-level static, so the pointer stays valid after this function returns */
+#else
         ARCH_STORE_REGS (ptr);
         info->stopped_regs = ptr;
+#endif
         if (gc_callbacks.thread_suspend_func)
                 gc_callbacks.thread_suspend_func (info->runtime_data, NULL);
 }
@@ -4939,41 +5044,39 @@ static int
 restart_threads_until_none_in_managed_allocator (void)
 {
        SgenThreadInfo *info;
-       int i, result, num_threads_died = 0;
+       int result, num_threads_died = 0;
        int sleep_duration = -1;
 
        for (;;) {
                int restart_count = 0, restarted_count = 0;
                /* restart all threads that stopped in the
                   allocator */
-               for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-                       for (info = thread_table [i]; info; info = info->next) {
-                               if (info->skip)
-                                       continue;
-                               if (!info->stack_start || info->in_critical_region ||
-                                               is_ip_in_managed_allocator (info->stopped_domain, info->stopped_ip)) {
-                                       binary_protocol_thread_restart ((gpointer)info->id);
+               FOREACH_THREAD (info) {
+                       if (info->skip)
+                               continue;
+                       if (!info->stack_start || info->in_critical_region ||
+                                       is_ip_in_managed_allocator (info->stopped_domain, info->stopped_ip)) {
+                               binary_protocol_thread_restart ((gpointer)info->id);
 #if defined(__MACH__) && MONO_MACH_ARCH_SUPPORTED
-                                       result = thread_resume (pthread_mach_thread_np (info->id));
+                               result = thread_resume (pthread_mach_thread_np (info->id));
 #else
-                                       result = pthread_kill (info->id, restart_signal_num);
+                               result = pthread_kill (info->id, restart_signal_num);
 #endif
-                                       if (result == 0) {
-                                               ++restart_count;
-                                       } else {
-                                               info->skip = 1;
-                                       }
+                               if (result == 0) {
+                                       ++restart_count;
                                } else {
-                                       /* we set the stopped_ip to
-                                          NULL for threads which
-                                          we're not restarting so
-                                          that we can easily identify
-                                          the others */
-                                       info->stopped_ip = NULL;
-                                       info->stopped_domain = NULL;
+                                       info->skip = 1;
                                }
+                       } else {
+                               /* we set the stopped_ip to
+                                  NULL for threads which
+                                  we're not restarting so
+                                  that we can easily identify
+                                  the others */
+                               info->stopped_ip = NULL;
+                               info->stopped_domain = NULL;
                        }
-               }
+               } END_FOREACH_THREAD
                /* if no threads were restarted, we're done */
                if (restart_count == 0)
                        break;
@@ -4994,22 +5097,20 @@ restart_threads_until_none_in_managed_allocator (void)
                }
 
                /* stop them again */
-               for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-                       for (info = thread_table [i]; info; info = info->next) {
-                               if (info->skip || info->stopped_ip == NULL)
-                                       continue;
+               FOREACH_THREAD (info) {
+                       if (info->skip || info->stopped_ip == NULL)
+                               continue;
 #if defined(__MACH__) && MONO_MACH_ARCH_SUPPORTED
-                               result = thread_suspend (pthread_mach_thread_np (info->id));
+                       result = thread_suspend (pthread_mach_thread_np (info->id));
 #else
-                               result = pthread_kill (info->id, suspend_signal_num);
+                       result = pthread_kill (info->id, suspend_signal_num);
 #endif
-                               if (result == 0) {
-                                       ++restarted_count;
-                               } else {
-                                       info->skip = 1;
-                               }
+                       if (result == 0) {
+                               ++restarted_count;
+                       } else {
+                               info->skip = 1;
                        }
-               }
+               } END_FOREACH_THREAD
                /* some threads might have died */
                num_threads_died += restart_count - restarted_count;
 #if defined(__MACH__) && MONO_MACH_ARCH_SUPPORTED
@@ -5032,7 +5133,11 @@ suspend_handler (int sig, siginfo_t *siginfo, void *context)
        pthread_t id;
        int stop_count;
        int old_errno = errno;
+#ifdef USE_MONO_CTX
+       MonoContext monoctx;
+#else
        gpointer regs [ARCH_NUM_REGS];
+#endif
        gpointer stack_start;
 
        id = pthread_self ();
@@ -5055,8 +5160,13 @@ suspend_handler (int sig, siginfo_t *siginfo, void *context)
        if (stack_start >= info->stack_start_limit && info->stack_start <= info->stack_end) {
                info->stack_start = stack_start;
 
+#ifdef USE_MONO_CTX
+               mono_sigctx_to_monoctx (context, &monoctx);
+               info->monoctx = &monoctx;
+#else
                ARCH_COPY_SIGCTX_REGS (regs, context);
                info->stopped_regs = regs;
+#endif
        } else {
                g_assert (!info->stack_start);
        }
@@ -5137,7 +5247,7 @@ stop_world (int generation)
 static int
 restart_world (int generation)
 {
-       int count, i;
+       int count;
        SgenThreadInfo *info;
        TV_DECLARE (end_sw);
        unsigned long usec;
@@ -5150,12 +5260,14 @@ restart_world (int generation)
                }
        }
        mono_profiler_gc_event (MONO_GC_EVENT_PRE_START_WORLD, generation);
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       info->stack_start = NULL;
-                       info->stopped_regs = NULL;
-               }
-       }
+       FOREACH_THREAD (info) {
+               info->stack_start = NULL;
+#ifdef USE_MONO_CTX
+               info->monoctx = NULL;
+#else
+               info->stopped_regs = NULL;
+#endif
+       } END_FOREACH_THREAD
 
        release_gc_locks ();
 
@@ -5213,57 +5325,62 @@ mono_gc_scan_object (void *obj)
 static void
 scan_thread_data (void *start_nursery, void *end_nursery, gboolean precise)
 {
-       int i;
         SgenThreadInfo *info;
 
         scan_area_arg_start = start_nursery;
         scan_area_arg_end = end_nursery;
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       if (info->skip) {
-                               DEBUG (3, fprintf (gc_debug_file, "Skipping dead thread %p, range: %p-%p, size: %td\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start));
-                               continue;
-                       }
-                       DEBUG (3, fprintf (gc_debug_file, "Scanning thread %p, range: %p-%p, size: %td, pinned=%d\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start, next_pin_slot));
-                       if (gc_callbacks.thread_mark_func && !conservative_stack_mark)
-                               gc_callbacks.thread_mark_func (info->runtime_data, info->stack_start, info->stack_end, precise);
-                       else if (!precise)
-                               conservatively_pin_objects_from (info->stack_start, info->stack_end, start_nursery, end_nursery, PIN_TYPE_STACK);
-
-                       if (!precise)
-                               conservatively_pin_objects_from (info->stopped_regs, info->stopped_regs + ARCH_NUM_REGS,
-                                               start_nursery, end_nursery, PIN_TYPE_STACK);
+       FOREACH_THREAD (info) { /* replaces the explicit nested walk over thread_table buckets and their chains */
+               if (info->skip) {
+                       DEBUG (3, fprintf (gc_debug_file, "Skipping dead thread %p, range: %p-%p, size: %td\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start));
+                       continue;
                 }
-       }
+               DEBUG (3, fprintf (gc_debug_file, "Scanning thread %p, range: %p-%p, size: %td, pinned=%d\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start, next_pin_slot));
+               if (gc_callbacks.thread_mark_func && !conservative_stack_mark) /* a registered mark callback takes precedence unless conservative marking is forced */
+                       gc_callbacks.thread_mark_func (info->runtime_data, info->stack_start, info->stack_end, precise);
+               else if (!precise)
+                       conservatively_pin_objects_from (info->stack_start, info->stack_end, start_nursery, end_nursery, PIN_TYPE_STACK);
+
+#ifdef USE_MONO_CTX
+               if (!precise) /* saved registers are only scanned in the conservative (non-precise) pass */
+                       conservatively_pin_objects_from ((void**)info->monoctx, (void**)info->monoctx + ARCH_NUM_REGS,
+                               start_nursery, end_nursery, PIN_TYPE_STACK); /* NOTE(review): treats the MonoContext as ARCH_NUM_REGS pointer-sized words -- confirm that covers every saved register */
+#else
+               if (!precise)
+                       conservatively_pin_objects_from (info->stopped_regs, info->stopped_regs + ARCH_NUM_REGS,
+                                       start_nursery, end_nursery, PIN_TYPE_STACK);
+#endif
+       } END_FOREACH_THREAD
 }
 
 static void
 find_pinning_ref_from_thread (char *obj, size_t size)
 {
-       int i, j;
+       int j;
         SgenThreadInfo *info;
         char *endobj = obj + size;
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       char **start = (char**)info->stack_start;
-                       if (info->skip)
-                               continue;
-                       while (start < (char**)info->stack_end) {
-                               if (*start >= obj && *start < endobj) {
-                                       DEBUG (0, fprintf (gc_debug_file, "Object %p referenced in thread %p (id %p) at %p, stack: %p-%p\n", obj, info, (gpointer)info->id, start, info->stack_start, info->stack_end));
-                               }
-                               start++;
+       FOREACH_THREAD (info) { /* per thread: log every stack word and saved register that points into [obj, obj + size) */
+               char **start = (char**)info->stack_start;
+               if (info->skip)
+                       continue;
+               while (start < (char**)info->stack_end) { /* walk the stack one pointer-sized word at a time */
+                       if (*start >= obj && *start < endobj) {
+                               DEBUG (0, fprintf (gc_debug_file, "Object %p referenced in thread %p (id %p) at %p, stack: %p-%p\n", obj, info, (gpointer)info->id, start, info->stack_start, info->stack_end));
                         }
+                       start++;
+               }
 
-                       for (j = 0; j < ARCH_NUM_REGS; ++j) {
-                               mword w = (mword)info->stopped_regs [j];
+               for (j = 0; j < ARCH_NUM_REGS; ++j) {
+#ifdef USE_MONO_CTX
+                       mword w = ((mword*)info->monoctx) [j]; /* NOTE(review): indexes the MonoContext as an array of mword -- confirm its layout makes that valid */
+#else
+                       mword w = (mword)info->stopped_regs [j];
+#endif
 
-                               if (w >= (mword)obj && w < (mword)obj + size)
-                                       DEBUG (0, fprintf (gc_debug_file, "Object %p referenced in saved reg %d of thread %p (id %p)\n", obj, j, info, (gpointer)info->id));
-                       }
-               }
+                       if (w >= (mword)obj && w < (mword)obj + size)
+                               DEBUG (0, fprintf (gc_debug_file, "Object %p referenced in saved reg %d of thread %p (id %p)\n", obj, j, info, (gpointer)info->id));
+               } END_FOREACH_THREAD /* NOTE(review): the brace on this line closes the for (j) loop, so END_FOREACH_THREAD comes before the brace that closes the FOREACH_THREAD body (next line) -- confirm the macros balance, or swap the two */
         }
 }
 
@@ -5398,12 +5515,10 @@ remset_stats (void)
        int i;
        mword *addresses, *bumper, *p, *r;
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       for (remset = info->remset; remset; remset = remset->next)
-                               size += remset->store_next - remset->data;
-               }
-       }
+       FOREACH_THREAD (info) {
+               for (remset = info->remset; remset; remset = remset->next)
+                       size += remset->store_next - remset->data;
+       } END_FOREACH_THREAD
        for (remset = freed_thread_remsets; remset; remset = remset->next)
                size += remset->store_next - remset->data;
        for (remset = global_remset; remset; remset = remset->next)
@@ -5411,12 +5526,10 @@ remset_stats (void)
 
        bumper = addresses = mono_sgen_alloc_internal_dynamic (sizeof (mword) * size, INTERNAL_MEM_STATISTICS);
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       for (remset = info->remset; remset; remset = remset->next)
-                               bumper = collect_store_remsets (remset, bumper);
-               }
-       }
+       FOREACH_THREAD (info) {
+               for (remset = info->remset; remset; remset = remset->next)
+                       bumper = collect_store_remsets (remset, bumper);
+       } END_FOREACH_THREAD
        for (remset = global_remset; remset; remset = remset->next)
                bumper = collect_store_remsets (remset, bumper);
        for (remset = freed_thread_remsets; remset; remset = remset->next)
@@ -5517,27 +5630,25 @@ scan_from_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue)
        generic_store_remsets = NULL;
 
        /* the per-thread ones */
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       RememberedSet *next;
-                       int j;
-                       for (remset = info->remset; remset; remset = next) {
-                               DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %td\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
-                               for (p = remset->data; p < remset->store_next;)
-                                       p = handle_remset (p, start_nursery, end_nursery, FALSE, queue);
-                               remset->store_next = remset->data;
-                               next = remset->next;
-                               remset->next = NULL;
-                               if (remset != info->remset) {
-                                       DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
-                                       mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
-                               }
+       FOREACH_THREAD (info) {
+               RememberedSet *next;
+               int j;
+               for (remset = info->remset; remset; remset = next) {
+                       DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %td\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
+                       for (p = remset->data; p < remset->store_next;)
+                               p = handle_remset (p, start_nursery, end_nursery, FALSE, queue);
+                       remset->store_next = remset->data;
+                       next = remset->next;
+                       remset->next = NULL;
+                       if (remset != info->remset) {
+                               DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
+                               mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
                        }
-                       for (j = 0; j < *info->store_remset_buffer_index_addr; ++j)
-                               handle_remset ((mword*)*info->store_remset_buffer_addr + j + 1, start_nursery, end_nursery, FALSE, queue);
-                       clear_thread_store_remset_buffer (info);
                }
-       }
+               for (j = 0; j < *info->store_remset_buffer_index_addr; ++j)
+                       handle_remset ((mword*)*info->store_remset_buffer_addr + j + 1, start_nursery, end_nursery, FALSE, queue);
+               clear_thread_store_remset_buffer (info);
+       } END_FOREACH_THREAD
 
        /* the freed thread ones */
        while (freed_thread_remsets) {
@@ -5561,7 +5672,6 @@ scan_from_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue)
 static void
 clear_remsets (void)
 {
-       int i;
        SgenThreadInfo *info;
        RememberedSet *remset, *next;
 
@@ -5582,20 +5692,18 @@ clear_remsets (void)
                generic_store_remsets = gs_next;
        }
        /* the per-thread ones */
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       for (remset = info->remset; remset; remset = next) {
-                               remset->store_next = remset->data;
-                               next = remset->next;
-                               remset->next = NULL;
-                               if (remset != info->remset) {
-                                       DEBUG (3, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
-                                       mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
-                               }
+       FOREACH_THREAD (info) {
+               for (remset = info->remset; remset; remset = next) {
+                       remset->store_next = remset->data;
+                       next = remset->next;
+                       remset->next = NULL;
+                       if (remset != info->remset) {
+                               DEBUG (3, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
+                               mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
                        }
-                       clear_thread_store_remset_buffer (info);
                }
-       }
+               clear_thread_store_remset_buffer (info);
+       } END_FOREACH_THREAD
 
        /* the freed thread ones */
        while (freed_thread_remsets) {
@@ -5613,17 +5721,14 @@ static void
 clear_tlabs (void)
 {
        SgenThreadInfo *info;
-       int i;
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       /* A new TLAB will be allocated when the thread does its first allocation */
-                       *info->tlab_start_addr = NULL;
-                       *info->tlab_next_addr = NULL;
-                       *info->tlab_temp_end_addr = NULL;
-                       *info->tlab_real_end_addr = NULL;
-               }
-       }
+       FOREACH_THREAD (info) {
+               /* A new TLAB will be allocated when the thread does its first allocation */
+               *info->tlab_start_addr = NULL;
+               *info->tlab_next_addr = NULL;
+               *info->tlab_temp_end_addr = NULL;
+               *info->tlab_real_end_addr = NULL;
+       } END_FOREACH_THREAD
 }
 
 /* LOCKING: assumes the GC lock is held */
@@ -5662,7 +5767,11 @@ gc_register_current_thread (void *addr)
        info->store_remset_buffer_index_addr = &STORE_REMSET_BUFFER_INDEX;
        info->stopped_ip = NULL;
        info->stopped_domain = NULL;
+#ifdef USE_MONO_CTX
+       info->monoctx = NULL;
+#else
        info->stopped_regs = NULL;
+#endif
 
        binary_protocol_thread_register ((gpointer)info->id);
 
@@ -5780,7 +5889,15 @@ unregister_current_thread (void)
 static void
 unregister_thread (void *k)
 {
-       g_assert (!mono_domain_get ());
+       /* If a delegate is passed to native code and invoked on a thread we don't
+        * know about, the JIT will register it with mono_jit_thread_attach, but
+        * we have no way of knowing when that thread goes away.  SGen has a TSD
+        * so we assume that if the domain is still registered, we can detach
+        * the thread.
+        */
+       if (mono_domain_get ())
+               mono_thread_detach (mono_thread_current ());
+
        LOCK_GC;
        unregister_current_thread ();
        UNLOCK_GC;
@@ -6464,23 +6581,21 @@ find_in_remsets (char *addr)
        }
 
        /* the per-thread ones */
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       int j;
-                       for (remset = info->remset; remset; remset = remset->next) {
-                               DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %td\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
-                               for (p = remset->data; p < remset->store_next;) {
-                                       p = find_in_remset_loc (p, addr, &found);
-                                       if (found)
-                                               return TRUE;
-                               }
-                       }
-                       for (j = 0; j < *info->store_remset_buffer_index_addr; ++j) {
-                               if ((*info->store_remset_buffer_addr) [j + 1] == addr)
+       FOREACH_THREAD (info) {
+               int j;
+               for (remset = info->remset; remset; remset = remset->next) {
+                       DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %td\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
+                       for (p = remset->data; p < remset->store_next;) {
+                               p = find_in_remset_loc (p, addr, &found);
+                               if (found)
                                        return TRUE;
                        }
                }
-       }
+               for (j = 0; j < *info->store_remset_buffer_index_addr; ++j) {
+                       if ((*info->store_remset_buffer_addr) [j + 1] == addr)
+                               return TRUE;
+               }
+       } END_FOREACH_THREAD
 
        /* the freed thread ones */
        for (remset = freed_thread_remsets; remset; remset = remset->next) {
@@ -6918,10 +7033,7 @@ mono_gc_base_init (void)
        char *major_collector_opt = NULL;
        struct sigaction sinfo;
        glong max_heap = 0;
-
-#ifdef PLATFORM_ANDROID
-       g_assert_not_reached ();
-#endif
+       int num_workers;
 
        /* the gc_initialized guard seems to imply this method is
           idempotent, but LOCK_INIT(gc_mutex) might not be.  It's
@@ -6972,10 +7084,8 @@ mono_gc_base_init (void)
                mono_sgen_marksweep_fixed_init (&major_collector);
        } else if (!major_collector_opt || !strcmp (major_collector_opt, "marksweep-par")) {
                mono_sgen_marksweep_par_init (&major_collector);
-               workers_init (mono_cpu_count ());
        } else if (!major_collector_opt || !strcmp (major_collector_opt, "marksweep-fixed-par")) {
                mono_sgen_marksweep_fixed_par_init (&major_collector);
-               workers_init (mono_cpu_count ());
        } else if (!strcmp (major_collector_opt, "copying")) {
                mono_sgen_copying_init (&major_collector);
        } else {
@@ -6989,6 +7099,11 @@ mono_gc_base_init (void)
        use_cardtable = FALSE;
 #endif
 
+       num_workers = mono_cpu_count ();
+       g_assert (num_workers > 0);
+       if (num_workers > 16)
+               num_workers = 16;
+
        /* Keep this the default for now */
        conservative_stack_mark = TRUE;
 
@@ -7025,6 +7140,26 @@ mono_gc_base_init (void)
                                }
                                continue;
                        }
+                       if (g_str_has_prefix (opt, "workers=")) { /* user override for num_workers */
+                               long val;
+                               char *endptr;
+                               if (!major_collector.is_parallel) {
+                                       fprintf (stderr, "The workers= option can only be used for parallel collectors.\n");
+                                       exit (1);
+                               }
+                               opt = strchr (opt, '=') + 1; /* step past "workers=" to the value text */
+                               val = strtol (opt, &endptr, 10);
+                               if (!*opt || *endptr) { /* reject an empty value or trailing non-numeric characters */
+                                       fprintf (stderr, "Cannot parse the workers= option value.\n");
+                                       exit (1);
+                               }
+                               if (val <= 0 || val > 16) { /* also catches strtol's saturated LONG_MIN/LONG_MAX results */
+                                       fprintf (stderr, "The number of workers must be in the range 1 to 16.\n");
+                                       exit (1);
+                               }
+                               num_workers = (int)val;
+                               continue;
+                       }
                        if (g_str_has_prefix (opt, "stack-mark=")) {
                                opt = strchr (opt, '=') + 1;
                                if (!strcmp (opt, "precise")) {
@@ -7075,6 +7210,9 @@ mono_gc_base_init (void)
                g_strfreev (opts);
        }
 
+       if (major_collector.is_parallel)
+               workers_init (num_workers);
+
        if (major_collector_opt)
                g_free (major_collector_opt);
 
@@ -7137,6 +7275,9 @@ mono_gc_base_init (void)
                g_strfreev (opts);
        }
 
+       if (major_collector.post_param_init)
+               major_collector.post_param_init ();
+
        suspend_ack_semaphore_ptr = &suspend_ack_semaphore;
        MONO_SEM_INIT (&suspend_ack_semaphore, 0);
 
@@ -7551,7 +7692,7 @@ mono_gc_get_managed_array_allocator (MonoVTable *vtable, int rank)
                return NULL;
        if (collect_before_allocs)
                return NULL;
-       g_assert (!klass->has_finalize && !klass->marshalbyref);
+       g_assert (!mono_class_has_finalizer (klass) && !klass->marshalbyref);
 
        return mono_gc_get_managed_allocator_by_type (ATYPE_VECTOR);
 #else
@@ -7825,4 +7966,11 @@ mono_sgen_get_logfile (void)
        return gc_debug_file;
 }
 
+#ifdef HOST_WIN32
+BOOL APIENTRY mono_gc_dllmain (HMODULE module_handle, DWORD reason, LPVOID reserved)
+{
+       return TRUE; /* module_handle, reason and reserved are all ignored; TRUE is returned unconditionally */
+}
+#endif
+
 #endif /* HAVE_SGEN_GC */