Move thread management from sgen to utils. Move smr from MonoInternalThread to thread...
[mono.git] / mono / metadata / sgen-gc.c
index bb0f4b70390460e7a689cafaad2d04ed2051d66e..f0a263fe0a436db4c8993cf56aee3a4603d03c98 100644 (file)
 #ifdef __MACH__
 #define _XOPEN_SOURCE
 #endif
+
+#include "metadata/sgen-gc.h"
 #include "metadata/metadata-internals.h"
 #include "metadata/class-internals.h"
 #include "metadata/gc-internal.h"
 #include "metadata/object-internals.h"
 #include "metadata/threads.h"
-#include "metadata/sgen-gc.h"
 #include "metadata/sgen-cardtable.h"
 #include "metadata/sgen-protocol.h"
 #include "metadata/sgen-archdep.h"
+#include "metadata/sgen-bridge.h"
 #include "metadata/mono-gc.h"
 #include "metadata/method-builder.h"
 #include "metadata/profiler-private.h"
@@ -255,6 +257,8 @@ static gboolean conservative_stack_mark = FALSE;
 /* If set, do a plausibility check on the scan_starts before and after
    each collection */
 static gboolean do_scan_starts_check = FALSE;
+static gboolean disable_minor_collections = FALSE;
+static gboolean disable_major_collections = FALSE;
 
 #ifdef HEAVY_STATISTICS
 static long long stat_objects_alloced = 0;
@@ -298,6 +302,8 @@ static int stat_wbarrier_value_copy = 0;
 static int stat_wbarrier_object_copy = 0;
 #endif
 
+static long long stat_pinned_objects = 0;
+
 static long long time_minor_pre_collection_fragment_clear = 0;
 static long long time_minor_pinning = 0;
 static long long time_minor_scan_remsets = 0;
@@ -437,13 +443,14 @@ static gpointer global_remset_cache [2];
  */
 #define DEFAULT_REMSET_SIZE 1024
 static RememberedSet* alloc_remset (int size, gpointer id);
+static RememberedSet* alloc_global_remset (SgenInternalAllocator *alc, int size, gpointer id);
 
 #define object_is_forwarded    SGEN_OBJECT_IS_FORWARDED
 #define object_is_pinned       SGEN_OBJECT_IS_PINNED
 #define pin_object             SGEN_PIN_OBJECT
 #define unpin_object           SGEN_UNPIN_OBJECT
 
-#define ptr_in_nursery(p)      (SGEN_PTR_IN_NURSERY ((p), DEFAULT_NURSERY_BITS, nursery_start, nursery_real_end))
+#define ptr_in_nursery(p)      (SGEN_PTR_IN_NURSERY ((p), DEFAULT_NURSERY_BITS, nursery_start, nursery_end))
 
 #define LOAD_VTABLE    SGEN_LOAD_VTABLE
 
@@ -644,7 +651,7 @@ add_profile_gc_root (GCRootReport *report, void *object, int rtype, uintptr_t ex
 /* 
  * The current allocation cursors
  * We allocate objects in the nursery.
- * The nursery is the area between nursery_start and nursery_real_end.
+ * The nursery is the area between nursery_start and nursery_end.
  * Allocation is done from a Thread Local Allocation Buffer (TLAB). TLABs are allocated
  * from nursery fragments.
  * tlab_next is the pointer to the space inside the TLAB where the next object will 
@@ -660,6 +667,10 @@ add_profile_gc_root (GCRootReport *report, void *object, int rtype, uintptr_t ex
  * MAX(nursery_last_pinned_end, nursery_frag_real_end)
  */
 static char *nursery_start = NULL;
+static char *nursery_next = NULL;
+static char *nursery_frag_real_end = NULL;
+static char *nursery_end = NULL;
+static char *nursery_last_pinned_end = NULL;
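
The comment above describes the nursery allocation scheme: objects are bump-allocated from a per-thread TLAB (thread-local allocation buffer) that is itself carved out of a nursery fragment. A minimal sketch of that fast path, assuming the thread-local cursors tlab_next/tlab_real_end used elsewhere in this file and a hypothetical refill_tlab_or_collect() slow path:

    static void*
    tlab_alloc_sketch (size_t size)
    {
            char *p = tlab_next;
            /* bump-pointer allocation inside the current TLAB */
            if (p + size <= tlab_real_end) {
                    tlab_next = p + size;
                    return p;
            }
            /* hypothetical slow path: carve a new TLAB out of a nursery
               fragment, or trigger a minor collection if none fits */
            return refill_tlab_or_collect (size);
    }
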
 
 #ifdef HAVE_KW_THREAD
 #define TLAB_ACCESS_INIT
@@ -705,10 +716,6 @@ static __thread char **tlab_next_addr;
 static __thread char *stack_end;
 static __thread long *store_remset_buffer_index_addr;
 #endif
-static char *nursery_next = NULL;
-static char *nursery_frag_real_end = NULL;
-static char *nursery_real_end = NULL;
-static char *nursery_last_pinned_end = NULL;
 
 /* The size of a TLAB */
 /* The bigger the value, the less often we have to go to the slow path to allocate a new 
@@ -743,6 +750,10 @@ static int moved_objects_idx = 0;
 /* Vtable of the objects used to fill out nursery fragments before a collection */
 static MonoVTable *array_fill_vtable;
 
+#ifdef SGEN_DEBUG_INTERNAL_ALLOC
+pthread_t main_gc_thread = NULL;
+#endif
+
 /*
  * ######################################################################
  * ########  Heap size accounting
@@ -813,7 +824,8 @@ typedef char* (*ScanObjectFunc) (char*, GrayQueue*);
 /* forward declarations */
 static int stop_world (int generation);
 static int restart_world (int generation);
-static void scan_thread_data (void *start_nursery, void *end_nursery, gboolean precise);
+static void scan_thread_data (void *start_nursery, void *end_nursery, gboolean precise, GrayQueue *queue);
+static void scan_from_global_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue);
 static void scan_from_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue);
 static void scan_from_registered_roots (CopyOrMarkObjectFunc copy_func, char *addr_start, char *addr_end, int root_type, GrayQueue *queue);
 static void scan_finalizer_entries (CopyOrMarkObjectFunc copy_func, FinalizeEntry *list, GrayQueue *queue);
@@ -823,18 +835,18 @@ static void find_pinning_ref_from_thread (char *obj, size_t size);
 static void update_current_thread_stack (void *start);
 static void finalize_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, GrayQueue *queue);
 static void add_or_remove_disappearing_link (MonoObject *obj, void **link, gboolean track, int generation);
-static void null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, GrayQueue *queue);
+static void null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, gboolean before_finalization, GrayQueue *queue);
 static void null_links_for_domain (MonoDomain *domain, int generation);
-static gboolean search_fragment_for_size (size_t size);
-static int search_fragment_for_size_range (size_t desired_size, size_t minimum_size);
+static gboolean alloc_fragment_for_size (size_t size);
+static int alloc_fragment_for_size_range (size_t desired_size, size_t minimum_size);
 static void clear_nursery_fragments (char *next);
-static void pin_from_roots (void *start_nursery, void *end_nursery);
+static void pin_from_roots (void *start_nursery, void *end_nursery, GrayQueue *queue);
 static int pin_objects_from_addresses (GCMemSection *section, void **start, void **end, void *start_nursery, void *end_nursery, GrayQueue *queue);
 static void optimize_pin_queue (int start_slot);
 static void clear_remsets (void);
 static void clear_tlabs (void);
 static void sort_addresses (void **array, int size);
-static void drain_gray_stack (GrayQueue *queue);
+static gboolean drain_gray_stack (GrayQueue *queue, int max_objs);
 static void finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *queue);
 static gboolean need_major_collection (mword space_needed);
 static void major_collection (const char *reason);
@@ -850,7 +862,7 @@ static void check_scan_starts (void);
 static void check_for_xdomain_refs (void);
 static void dump_heap (const char *type, int num, const char *reason);
 
-void mono_gc_scan_for_specific_ref (MonoObject *key);
+void mono_gc_scan_for_specific_ref (MonoObject *key, gboolean precise);
 
 static void init_stats (void);
 
@@ -1165,7 +1177,7 @@ check_reference_for_xdomain (gpointer *ptr, char *obj, MonoDomain *domain)
                        o, o->vtable->klass->name_space, o->vtable->klass->name,
                        offset, field ? field->name : "",
                        ref, ref->vtable->klass->name_space, ref->vtable->klass->name, str ? str : "");
-       mono_gc_scan_for_specific_ref (o);
+       mono_gc_scan_for_specific_ref (o, TRUE);
        if (str)
                g_free (str);
 }
@@ -1181,6 +1193,8 @@ scan_object_for_xdomain_refs (char *start, mword size, void *data)
        #include "sgen-scan-object.h"
 }
 
+static gboolean scan_object_for_specific_ref_precise = TRUE;
+
 #undef HANDLE_PTR
 #define HANDLE_PTR(ptr,obj) do {               \
        if ((MonoObject*)*(ptr) == key) {       \
@@ -1197,7 +1211,19 @@ scan_object_for_specific_ref (char *start, MonoObject *key)
        if ((forwarded = SGEN_OBJECT_IS_FORWARDED (start)))
                start = forwarded;
 
-       #include "sgen-scan-object.h"
+       if (scan_object_for_specific_ref_precise) {
+               #include "sgen-scan-object.h"
+       } else {
+               mword *words = (mword*)start;
+               size_t size = safe_object_get_size ((MonoObject*)start);
+               int i;
+               for (i = 0; i < size / sizeof (mword); ++i) {
+                       if (words [i] == (mword)key) {
+                               g_print ("found possible ref to %p in object %p (%s) at offset %td\n",
+                                               key, start, safe_name (start), i * sizeof (mword));
+                       }
+               }
+       }
 }
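
With the flag added above, the debug scanner can also run in an imprecise mode that reports any raw word whose bit pattern equals the key, not just the fields a precise scan would visit. A hypothetical debugging call:

    /* precise == FALSE: also report raw memory words that merely look
       like references to 'obj' (call site is hypothetical) */
    mono_gc_scan_for_specific_ref (obj, FALSE);
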
 
 void
@@ -1308,11 +1334,13 @@ scan_roots_for_specific_ref (MonoObject *key, int root_type)
 }
 
 void
-mono_gc_scan_for_specific_ref (MonoObject *key)
+mono_gc_scan_for_specific_ref (MonoObject *key, gboolean precise)
 {
        RootRecord *root;
        int i;
 
+       scan_object_for_specific_ref_precise = precise;
+
        mono_sgen_scan_area_with_callback (nursery_section->data, nursery_section->end_data,
                        (IterateObjectCallbackFunc)scan_object_for_specific_ref_callback, key, TRUE);
 
@@ -1637,10 +1665,10 @@ global_remset_location_was_not_added (gpointer ptr)
  * lock must be held.  For serial collectors that is not necessary.
  */
 void
-mono_sgen_add_to_global_remset (gpointer ptr)
+mono_sgen_add_to_global_remset (SgenInternalAllocator *alc, gpointer ptr)
 {
        RememberedSet *rs;
-       gboolean lock;
+       gboolean lock = major_collector.is_parallel;
 
        if (use_cardtable) {
                sgen_card_table_mark_address ((mword)ptr);
@@ -1649,7 +1677,6 @@ mono_sgen_add_to_global_remset (gpointer ptr)
 
        g_assert (!ptr_in_nursery (ptr) && ptr_in_nursery (*(gpointer*)ptr));
 
-       lock = (current_collection_generation == GENERATION_OLD && major_collector.is_parallel);
        if (lock)
                LOCK_GLOBAL_REMSET;
 
@@ -1669,7 +1696,7 @@ mono_sgen_add_to_global_remset (gpointer ptr)
                *(global_remset->store_next++) = (mword)ptr;
                goto done;
        }
-       rs = alloc_remset (global_remset->end_set - global_remset->data, NULL);
+       rs = alloc_global_remset (alc, global_remset->end_set - global_remset->data, NULL);
        rs->next = global_remset;
        global_remset = rs;
        *(global_remset->store_next++) = (mword)ptr;
@@ -1695,8 +1722,8 @@ mono_sgen_add_to_global_remset (gpointer ptr)
  * frequently after each object is copied, to achieve better locality and cache
  * usage.
  */
-static void
-drain_gray_stack (GrayQueue *queue)
+static gboolean
+drain_gray_stack (GrayQueue *queue, int max_objs)
 {
        char *obj;
 
@@ -1704,21 +1731,26 @@ drain_gray_stack (GrayQueue *queue)
                for (;;) {
                        GRAY_OBJECT_DEQUEUE (queue, obj);
                        if (!obj)
-                               break;
+                               return TRUE;
                        DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", obj, safe_name (obj)));
                        major_collector.minor_scan_object (obj, queue);
                }
        } else {
+               int i;
+
                if (major_collector.is_parallel && queue == &workers_distribute_gray_queue)
-                       return;
+                       return TRUE;
 
-               for (;;) {
-                       GRAY_OBJECT_DEQUEUE (queue, obj);
-                       if (!obj)
-                               break;
-                       DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", obj, safe_name (obj)));
-                       major_collector.major_scan_object (obj, queue);
-               }
+               do {
+                       for (i = 0; i != max_objs; ++i) {
+                               GRAY_OBJECT_DEQUEUE (queue, obj);
+                               if (!obj)
+                                       return TRUE;
+                               DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", obj, safe_name (obj)));
+                               major_collector.major_scan_object (obj, queue);
+                       }
+               } while (max_objs < 0);
+               return FALSE;
        }
 }
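
drain_gray_stack now takes a max_objs argument: a negative value drains until the queue is empty (and returns TRUE), otherwise it stops after max_objs objects and returns FALSE if work may remain. A sketch of how a caller could interleave batched draining with handing work to the parallel workers; the batch size of 1000 is illustrative only:

    while (!drain_gray_stack (queue, 1000))
            workers_distribute_gray_queue_sections ();
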
 
@@ -1844,6 +1876,7 @@ pin_objects_from_addresses (GCMemSection *section, void **start, void **end, voi
                        add_profile_gc_root (&report, definitely_pinned [idx], MONO_PROFILE_GC_ROOT_PINNING, 0);
                notify_gc_roots (&report);
        }
+       stat_pinned_objects += count;
        return count;
 }
 
@@ -2004,7 +2037,7 @@ conservatively_pin_objects_from (void **start, void **end, void *start_nursery,
                                pin_stage_ptr ((void*)addr);
                        if (heap_dump_file)
                                pin_stats_register_address ((char*)addr, pin_type);
-                       DEBUG (6, if (count) fprintf (gc_debug_file, "Pinning address %p\n", (void*)addr));
+                       DEBUG (6, if (count) fprintf (gc_debug_file, "Pinning address %p from %p\n", (void*)addr, start));
                        count++;
                }
                start++;
@@ -2044,7 +2077,7 @@ find_pinning_reference (char *obj, size_t size)
  * conservatively scanned.
  */
 static void
-pin_from_roots (void *start_nursery, void *end_nursery)
+pin_from_roots (void *start_nursery, void *end_nursery, GrayQueue *queue)
 {
        RootRecord *root;
        int i;
@@ -2063,18 +2096,39 @@ pin_from_roots (void *start_nursery, void *end_nursery)
         * *) the _last_ managed stack frame
         * *) pointers slots in managed frames
         */
-       scan_thread_data (start_nursery, end_nursery, FALSE);
+       scan_thread_data (start_nursery, end_nursery, FALSE, queue);
 
        evacuate_pin_staging_area ();
 }
 
-static CopyOrMarkObjectFunc user_copy_or_mark_func;
-static GrayQueue *user_copy_or_mark_queue;
+typedef struct {
+       CopyOrMarkObjectFunc func;
+       GrayQueue *queue;
+} UserCopyOrMarkData;
+
+static pthread_key_t user_copy_or_mark_key;
+
+static void
+init_user_copy_or_mark_key (void)
+{
+       pthread_key_create (&user_copy_or_mark_key, NULL);
+}
+
+static void
+set_user_copy_or_mark_data (UserCopyOrMarkData *data)
+{
+       static pthread_once_t init_control = PTHREAD_ONCE_INIT;
+       pthread_once (&init_control, init_user_copy_or_mark_key);
+
+       pthread_setspecific (user_copy_or_mark_key, data);
+}
 
 static void
 single_arg_user_copy_or_mark (void **obj)
 {
-       user_copy_or_mark_func (obj, user_copy_or_mark_queue);
+       UserCopyOrMarkData *data = pthread_getspecific (user_copy_or_mark_key);
+
+       data->func (obj, data->queue);
 }
 
 /*
@@ -2095,7 +2149,7 @@ precisely_scan_objects_from (CopyOrMarkObjectFunc copy_func, void** start_root,
                        if ((desc & 1) && *start_root) {
                                copy_func (start_root, queue);
                                DEBUG (9, fprintf (gc_debug_file, "Overwrote root at %p with %p\n", start_root, *start_root));
-                               drain_gray_stack (queue);
+                               drain_gray_stack (queue, -1);
                        }
                        desc >>= 1;
                        start_root++;
@@ -2113,7 +2167,7 @@ precisely_scan_objects_from (CopyOrMarkObjectFunc copy_func, void** start_root,
                                if ((bmap & 1) && *objptr) {
                                        copy_func (objptr, queue);
                                        DEBUG (9, fprintf (gc_debug_file, "Overwrote root at %p with %p\n", objptr, *objptr));
-                                       drain_gray_stack (queue);
+                                       drain_gray_stack (queue, -1);
                                }
                                bmap >>= 1;
                                ++objptr;
@@ -2123,12 +2177,11 @@ precisely_scan_objects_from (CopyOrMarkObjectFunc copy_func, void** start_root,
                break;
        }
        case ROOT_DESC_USER: {
+               UserCopyOrMarkData data = { copy_func, queue };
                MonoGCRootMarkFunc marker = user_descriptors [desc >> ROOT_DESC_TYPE_SHIFT];
-               user_copy_or_mark_func = copy_func;
-               user_copy_or_mark_queue = queue;
+               set_user_copy_or_mark_data (&data);
                marker (start_root, single_arg_user_copy_or_mark);
-               user_copy_or_mark_func = NULL;
-               user_copy_or_mark_queue = NULL;
+               set_user_copy_or_mark_data (NULL);
                break;
        }
        case ROOT_DESC_RUN_LEN:
@@ -2176,6 +2229,19 @@ alloc_fragment (void)
        frag->next = NULL;
        return frag;
 }
+static void
+add_fragment (char *start, char *end)
+{
+       Fragment *fragment;
+
+       fragment = alloc_fragment ();
+       fragment->fragment_start = start;
+       fragment->fragment_limit = start;
+       fragment->fragment_end = end;
+       fragment->next = nursery_fragments;
+       nursery_fragments = fragment;
+}
 
 /* size must be a power of 2 */
 void*
@@ -2208,7 +2274,6 @@ alloc_nursery (void)
        GCMemSection *section;
        char *data;
        int scan_starts;
-       Fragment *frag;
        int alloc_size;
 
        if (nursery_section)
@@ -2229,13 +2294,12 @@ alloc_nursery (void)
        data = major_collector.alloc_heap (alloc_size, 0, DEFAULT_NURSERY_BITS);
 #endif
        nursery_start = data;
-       nursery_real_end = nursery_start + nursery_size;
-       mono_sgen_update_heap_boundaries ((mword)nursery_start, (mword)nursery_real_end);
-       nursery_next = nursery_start;
+       nursery_end = nursery_start + nursery_size;
+       mono_sgen_update_heap_boundaries ((mword)nursery_start, (mword)nursery_end);
        DEBUG (4, fprintf (gc_debug_file, "Expanding nursery size (%p-%p): %lu, total: %lu\n", data, data + alloc_size, (unsigned long)nursery_size, (unsigned long)total_alloc));
        section->data = section->next_data = data;
        section->size = alloc_size;
-       section->end_data = nursery_real_end;
+       section->end_data = nursery_end;
        scan_starts = (alloc_size + SCAN_START_SIZE - 1) / SCAN_START_SIZE;
        section->scan_starts = mono_sgen_alloc_internal_dynamic (sizeof (char*) * scan_starts, INTERNAL_MEM_SCAN_STARTS);
        section->num_scan_start = scan_starts;
@@ -2245,12 +2309,7 @@ alloc_nursery (void)
        nursery_section = section;
 
        /* Setup the single first large fragment */
-       frag = alloc_fragment ();
-       frag->fragment_start = nursery_start;
-       frag->fragment_limit = nursery_start;
-       frag->fragment_end = nursery_real_end;
-       nursery_frag_real_end = nursery_real_end;
-       /* FIXME: frag here is lost */
+       add_fragment (nursery_start, nursery_end);
 }
 
 void*
@@ -2265,6 +2324,18 @@ mono_gc_get_nursery (int *shift_bits, size_t *size)
        return nursery_start;
 }
 
+gboolean
+mono_gc_precise_stack_mark_enabled (void)
+{
+       return !conservative_stack_mark;
+}
+
+FILE *
+mono_gc_get_logfile (void)
+{
+       return mono_sgen_get_logfile ();
+}
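
Two small accessors are exported here: mono_gc_precise_stack_mark_enabled() reports whether stacks are scanned precisely (i.e. conservative stack marking is off) and mono_gc_get_logfile() exposes the collector's log file. A hypothetical caller elsewhere in the runtime could use them like this:

    FILE *gclog = mono_gc_get_logfile ();
    if (gclog && mono_gc_precise_stack_mark_enabled ())
            fprintf (gclog, "precise stack marking is enabled\n");
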
+
 static void
 report_finalizer_roots_list (FinalizeEntry *list)
 {
@@ -2387,7 +2458,6 @@ static mword fragment_total = 0;
 static void
 add_nursery_frag (size_t frag_size, char* frag_start, char* frag_end)
 {
-       Fragment *fragment;
        DEBUG (4, fprintf (gc_debug_file, "Found empty fragment: %p-%p, size: %zd\n", frag_start, frag_end, frag_size));
        binary_protocol_empty (frag_start, frag_size);
        /* Not worth dealing with smaller fragments: need to tune */
@@ -2396,12 +2466,7 @@ add_nursery_frag (size_t frag_size, char* frag_start, char* frag_end)
                if (nursery_clear_policy == CLEAR_AT_GC)
                        memset (frag_start, 0, frag_size);
 
-               fragment = alloc_fragment ();
-               fragment->fragment_start = frag_start;
-               fragment->fragment_limit = frag_start;
-               fragment->fragment_end = frag_end;
-               fragment->next = nursery_fragments;
-               nursery_fragments = fragment;
+               add_fragment (frag_start, frag_end);
                fragment_total += frag_size;
        } else {
                /* Clear unused fragments, pinning depends on this */
@@ -2440,6 +2505,28 @@ get_finalize_entry_hash_table (int generation)
        }
 }
 
+static MonoObject **finalized_array = NULL;
+static int finalized_array_capacity = 0;
+static int finalized_array_entries = 0;
+
+static void
+bridge_register_finalized_object (MonoObject *object)
+{
+       if (!finalized_array)
+               return;
+
+       if (finalized_array_entries >= finalized_array_capacity) {
+               MonoObject **new_array;
+               g_assert (finalized_array_entries == finalized_array_capacity);
+               finalized_array_capacity *= 2;
+               new_array = mono_sgen_alloc_internal_dynamic (sizeof (MonoObject*) * finalized_array_capacity, INTERNAL_MEM_BRIDGE_DATA);
+               memcpy (new_array, finalized_array, sizeof (MonoObject*) * finalized_array_entries);
+               mono_sgen_free_internal_dynamic (finalized_array, sizeof (MonoObject*) * finalized_array_entries, INTERNAL_MEM_BRIDGE_DATA);
+               finalized_array = new_array;
+       }
+       finalized_array [finalized_array_entries++] = object;
+}
+
 static void
 finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *queue)
 {
@@ -2447,6 +2534,7 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
        TV_DECLARE (btv);
        int fin_ready;
        int ephemeron_rounds = 0;
+       int num_loops;
        CopyOrMarkObjectFunc copy_func = current_collection_generation == GENERATION_NURSERY ? major_collector.copy_object : major_collector.copy_or_mark_object;
 
        /*
@@ -2462,9 +2550,24 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
         *   To achieve better cache locality and cache usage, we drain the gray stack 
         * frequently, after each object is copied, and just finish the work here.
         */
-       drain_gray_stack (queue);
+       drain_gray_stack (queue, -1);
        TV_GETTIME (atv);
        DEBUG (2, fprintf (gc_debug_file, "%s generation done\n", generation_name (generation)));
+
+	/*
+	We must null weak links that don't track resurrection before processing
+	objects ready for finalization, so those links are already cleared when finalizers run.
+	*/
+       null_link_in_range (copy_func, start_addr, end_addr, generation, TRUE, queue);
+       if (generation == GENERATION_OLD)
+               null_link_in_range (copy_func, start_addr, end_addr, GENERATION_NURSERY, TRUE, queue);
+
+       if (finalized_array == NULL && mono_sgen_need_bridge_processing ()) {
+               finalized_array_capacity = 32;
+               finalized_array = mono_sgen_alloc_internal_dynamic (sizeof (MonoObject*) * finalized_array_capacity, INTERNAL_MEM_BRIDGE_DATA);
+       }
+       finalized_array_entries = 0;
+
        /* walk the finalization queue and move also the objects that need to be
         * finalized: use the finalized objects as new roots so the objects they depend
         * on are also not reclaimed. As with the roots above, only objects in the nursery
@@ -2472,6 +2575,7 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
         * We need a loop here, since objects ready for finalizers may reference other objects
         * that are fin-ready. Speedup with a flag?
         */
+       num_loops = 0;
        do {
                /*
                 * Walk the ephemeron tables marking all values with reachable keys. This must be completely done
@@ -2483,20 +2587,29 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
                int done_with_ephemerons = 0;
                do {
                        done_with_ephemerons = mark_ephemerons_in_range (copy_func, start_addr, end_addr, queue);
-                       drain_gray_stack (queue);
+                       drain_gray_stack (queue, -1);
                        ++ephemeron_rounds;
                } while (!done_with_ephemerons);
 
                fin_ready = num_ready_finalizers;
                finalize_in_range (copy_func, start_addr, end_addr, generation, queue);
                if (generation == GENERATION_OLD)
-                       finalize_in_range (copy_func, nursery_start, nursery_real_end, GENERATION_NURSERY, queue);
+                       finalize_in_range (copy_func, nursery_start, nursery_end, GENERATION_NURSERY, queue);
+
+               if (fin_ready != num_ready_finalizers) {
+                       ++num_loops;
+                       if (finalized_array != NULL)
+                               mono_sgen_bridge_processing (finalized_array_entries, finalized_array);
+               }
 
                /* drain the new stack that might have been created */
                DEBUG (6, fprintf (gc_debug_file, "Precise scan of gray area post fin\n"));
-               drain_gray_stack (queue);
+               drain_gray_stack (queue, -1);
        } while (fin_ready != num_ready_finalizers);
 
+       if (mono_sgen_need_bridge_processing ())
+               g_assert (num_loops <= 1);
+
        /*
         * Clear ephemeron pairs with unreachable keys.
         * We pass the copy func so we can figure out if an array was promoted or not.
@@ -2516,12 +2629,12 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
         */
        g_assert (gray_object_queue_is_empty (queue));
        for (;;) {
-               null_link_in_range (copy_func, start_addr, end_addr, generation, queue);
+               null_link_in_range (copy_func, start_addr, end_addr, generation, FALSE, queue);
                if (generation == GENERATION_OLD)
-                       null_link_in_range (copy_func, start_addr, end_addr, GENERATION_NURSERY, queue);
+                       null_link_in_range (copy_func, start_addr, end_addr, GENERATION_NURSERY, FALSE, queue);
                if (gray_object_queue_is_empty (queue))
                        break;
-               drain_gray_stack (queue);
+               drain_gray_stack (queue, -1);
        }
 
        g_assert (gray_object_queue_is_empty (queue));
@@ -2579,7 +2692,7 @@ build_nursery_fragments (void **start, int num_entries)
                frag_start = (char*)start [i] + frag_size;
        }
        nursery_last_pinned_end = frag_start;
-       frag_end = nursery_real_end;
+       frag_end = nursery_end;
        frag_size = frag_end - frag_start;
        if (frag_size)
                add_nursery_frag (frag_size, frag_start, frag_end);
@@ -2783,6 +2896,7 @@ init_stats (void)
        mono_counters_register ("Major sweep", MONO_COUNTER_GC | MONO_COUNTER_LONG, &time_major_sweep);
        mono_counters_register ("Major fragment creation", MONO_COUNTER_GC | MONO_COUNTER_LONG, &time_major_fragment_creation);
 
+       mono_counters_register ("Number of pinned objects", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_pinned_objects);
 
 #ifdef HEAVY_STATISTICS
        mono_counters_register ("WBarrier set field", MONO_COUNTER_GC | MONO_COUNTER_INT, &stat_wbarrier_set_field);
@@ -2906,6 +3020,60 @@ mono_sgen_need_major_collection (mword space_needed)
        return need_major_collection (space_needed);
 }
 
+static GrayQueue*
+job_gray_queue (WorkerData *worker_data)
+{
+       return worker_data ? &worker_data->private_gray_queue : WORKERS_DISTRIBUTE_GRAY_QUEUE;
+}
+
+typedef struct
+{
+       char *heap_start;
+       char *heap_end;
+} ScanFromRemsetsJobData;
+
+static void
+job_scan_from_remsets (WorkerData *worker_data, void *job_data_untyped)
+{
+       ScanFromRemsetsJobData *job_data = job_data_untyped;
+
+       scan_from_remsets (job_data->heap_start, job_data->heap_end, job_gray_queue (worker_data));
+}
+
+typedef struct
+{
+       CopyOrMarkObjectFunc func;
+       char *heap_start;
+       char *heap_end;
+       int root_type;
+} ScanFromRegisteredRootsJobData;
+
+static void
+job_scan_from_registered_roots (WorkerData *worker_data, void *job_data_untyped)
+{
+       ScanFromRegisteredRootsJobData *job_data = job_data_untyped;
+
+       scan_from_registered_roots (job_data->func,
+                       job_data->heap_start, job_data->heap_end,
+                       job_data->root_type,
+                       job_gray_queue (worker_data));
+}
+
+typedef struct
+{
+       char *heap_start;
+       char *heap_end;
+} ScanThreadDataJobData;
+
+static void
+job_scan_thread_data (WorkerData *worker_data, void *job_data_untyped)
+{
+       ScanThreadDataJobData *job_data = job_data_untyped;
+
+       scan_thread_data (job_data->heap_start, job_data->heap_end, TRUE,
+                       job_gray_queue (worker_data));
+}
+
 /*
  * Collect objects in the nursery.  Returns whether to trigger a major
  * collection.
@@ -2916,11 +3084,17 @@ collect_nursery (size_t requested_size)
        gboolean needs_major;
        size_t max_garbage_amount;
        char *orig_nursery_next;
+       ScanFromRemsetsJobData sfrjd;
+       ScanFromRegisteredRootsJobData scrrjd_normal, scrrjd_wbarrier;
+       ScanThreadDataJobData stdjd;
        TV_DECLARE (all_atv);
        TV_DECLARE (all_btv);
        TV_DECLARE (atv);
        TV_DECLARE (btv);
 
+       if (disable_minor_collections)
+               return TRUE;
+
        mono_perfcounters->gc_collections0++;
 
        current_collection_generation = GENERATION_NURSERY;
@@ -2933,7 +3107,7 @@ collect_nursery (size_t requested_size)
        orig_nursery_next = nursery_next;
        nursery_next = MAX (nursery_next, nursery_last_pinned_end);
        /* FIXME: optimize later to use the higher address where an object can be present */
-       nursery_next = MAX (nursery_next, nursery_real_end);
+       nursery_next = MAX (nursery_next, nursery_end);
 
        DEBUG (1, fprintf (gc_debug_file, "Start nursery collection %d %p-%p, size: %d\n", num_minor_gcs, nursery_start, nursery_next, (int)(nursery_next - nursery_start)));
        max_garbage_amount = nursery_next - nursery_start;
@@ -2959,6 +3133,7 @@ collect_nursery (size_t requested_size)
        try_calculate_minor_collection_allowance (FALSE);
 
        gray_object_queue_init (&gray_queue, mono_sgen_get_unmanaged_allocator ());
+       workers_init_distribute_gray_queue ();
 
        num_minor_gcs++;
        mono_stats.minor_gc_count ++;
@@ -2968,10 +3143,10 @@ collect_nursery (size_t requested_size)
        /* pin from pinned handles */
        init_pinning ();
        mono_profiler_gc_event (MONO_GC_EVENT_MARK_START, 0);
-       pin_from_roots (nursery_start, nursery_next);
+       pin_from_roots (nursery_start, nursery_next, WORKERS_DISTRIBUTE_GRAY_QUEUE);
        /* identify pinned objects */
        optimize_pin_queue (0);
-       next_pin_slot = pin_objects_from_addresses (nursery_section, pin_queue, pin_queue + next_pin_slot, nursery_start, nursery_next, &gray_queue);
+       next_pin_slot = pin_objects_from_addresses (nursery_section, pin_queue, pin_queue + next_pin_slot, nursery_start, nursery_next, WORKERS_DISTRIBUTE_GRAY_QUEUE);
        nursery_section->pin_queue_start = pin_queue;
        nursery_section->pin_queue_num_entries = next_pin_slot;
        TV_GETTIME (atv);
@@ -2982,12 +3157,23 @@ collect_nursery (size_t requested_size)
        if (consistency_check_at_minor_collection)
                check_consistency ();
 
-       /* 
-        * walk all the roots and copy the young objects to the old generation,
-        * starting from to_space
+       workers_start_all_workers ();
+
+       /*
+        * Walk all the roots and copy the young objects to the old
+        * generation, starting from to_space.
+        *
+	 * The global remsets must be processed before the workers start
+	 * marking because the workers themselves can add new global remsets.
         */
+       scan_from_global_remsets (nursery_start, nursery_next, WORKERS_DISTRIBUTE_GRAY_QUEUE);
+
+       workers_start_marking ();
+
+       sfrjd.heap_start = nursery_start;
+       sfrjd.heap_end = nursery_next;
+       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_remsets, &sfrjd);
 
-       scan_from_remsets (nursery_start, nursery_next, &gray_queue);
        /* we don't have complete write barrier yet, so we scan all the old generation sections */
        TV_GETTIME (btv);
        time_minor_scan_remsets += TV_ELAPSED_MS (atv, btv);
@@ -2996,12 +3182,13 @@ collect_nursery (size_t requested_size)
        if (use_cardtable) {
                atv = btv;
                card_tables_collect_stats (TRUE);
-               scan_from_card_tables (nursery_start, nursery_next, &gray_queue);
+               scan_from_card_tables (nursery_start, nursery_next, WORKERS_DISTRIBUTE_GRAY_QUEUE);
                TV_GETTIME (btv);
                time_minor_scan_card_table += TV_ELAPSED_MS (atv, btv);
        }
 
-       drain_gray_stack (&gray_queue);
+       if (!major_collector.is_parallel)
+               drain_gray_stack (&gray_queue, -1);
 
        if (mono_profiler_get_events () & MONO_PROFILE_GC_ROOTS)
                report_registered_roots ();
@@ -3009,22 +3196,57 @@ collect_nursery (size_t requested_size)
                report_finalizer_roots ();
        TV_GETTIME (atv);
        time_minor_scan_pinned += TV_ELAPSED_MS (btv, atv);
+
        /* registered roots, this includes static fields */
-       scan_from_registered_roots (major_collector.copy_object, nursery_start, nursery_next, ROOT_TYPE_NORMAL, &gray_queue);
-       scan_from_registered_roots (major_collector.copy_object, nursery_start, nursery_next, ROOT_TYPE_WBARRIER, &gray_queue);
+       scrrjd_normal.func = major_collector.copy_object;
+       scrrjd_normal.heap_start = nursery_start;
+       scrrjd_normal.heap_end = nursery_next;
+       scrrjd_normal.root_type = ROOT_TYPE_NORMAL;
+       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_registered_roots, &scrrjd_normal);
+
+       scrrjd_wbarrier.func = major_collector.copy_object;
+       scrrjd_wbarrier.heap_start = nursery_start;
+       scrrjd_wbarrier.heap_end = nursery_next;
+       scrrjd_wbarrier.root_type = ROOT_TYPE_WBARRIER;
+       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_registered_roots, &scrrjd_wbarrier);
+
        TV_GETTIME (btv);
        time_minor_scan_registered_roots += TV_ELAPSED_MS (atv, btv);
+
        /* thread data */
-       scan_thread_data (nursery_start, nursery_next, TRUE);
+       stdjd.heap_start = nursery_start;
+       stdjd.heap_end = nursery_next;
+       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_thread_data, &stdjd);
+
        TV_GETTIME (atv);
        time_minor_scan_thread_data += TV_ELAPSED_MS (btv, atv);
        btv = atv;
 
+       if (major_collector.is_parallel) {
+               while (!gray_object_queue_is_empty (WORKERS_DISTRIBUTE_GRAY_QUEUE)) {
+                       workers_distribute_gray_queue_sections ();
+                       usleep (1000);
+               }
+       }
+       workers_join ();
+
+       if (major_collector.is_parallel)
+               g_assert (gray_object_queue_is_empty (&gray_queue));
+
        finish_gray_stack (nursery_start, nursery_next, GENERATION_NURSERY, &gray_queue);
        TV_GETTIME (atv);
        time_minor_finish_gray_stack += TV_ELAPSED_MS (btv, atv);
        mono_profiler_gc_event (MONO_GC_EVENT_MARK_END, 0);
 
+       /*
+        * The (single-threaded) finalization code might have done
+        * some copying/marking so we can only reset the GC thread's
+        * worker data here instead of earlier when we joined the
+        * workers.
+        */
+       if (major_collector.reset_worker_data)
+               major_collector.reset_worker_data (workers_gc_thread_data.major_collector_data);
+
        if (objects_pinned) {
                evacuate_pin_staging_area ();
                optimize_pin_queue (0);
@@ -3080,6 +3302,21 @@ collect_nursery (size_t requested_size)
        return needs_major;
 }
 
+typedef struct
+{
+       FinalizeEntry *list;
+} ScanFinalizerEntriesJobData;
+
+static void
+job_scan_finalizer_entries (WorkerData *worker_data, void *job_data_untyped)
+{
+       ScanFinalizerEntriesJobData *job_data = job_data_untyped;
+
+       scan_finalizer_entries (major_collector.copy_or_mark_object,
+                       job_data->list,
+                       job_gray_queue (worker_data));
+}
+
 static void
 major_do_collection (const char *reason)
 {
@@ -3094,6 +3331,9 @@ major_do_collection (const char *reason)
        char *heap_start = NULL;
        char *heap_end = (char*)-1;
        int old_next_pin_slot;
+       ScanFromRegisteredRootsJobData scrrjd_normal, scrrjd_wbarrier;
+       ScanThreadDataJobData stdjd;
+       ScanFinalizerEntriesJobData sfejd_fin_ready, sfejd_critical_fin;
 
        mono_perfcounters->gc_collections1++;
 
@@ -3113,8 +3353,7 @@ major_do_collection (const char *reason)
        binary_protocol_collection (GENERATION_OLD);
        check_scan_starts ();
        gray_object_queue_init (&gray_queue, mono_sgen_get_unmanaged_allocator ());
-       if (major_collector.is_parallel)
-               gray_object_queue_init (&workers_distribute_gray_queue, mono_sgen_get_unmanaged_allocator ());
+       workers_init_distribute_gray_queue ();
 
        degraded_mode = 0;
        DEBUG (1, fprintf (gc_debug_file, "Start major collection %d\n", num_major_gcs));
@@ -3131,7 +3370,7 @@ major_do_collection (const char *reason)
        TV_GETTIME (btv);
        time_major_pre_collection_fragment_clear += TV_ELAPSED_MS (atv, btv);
 
-       nursery_section->next_data = nursery_real_end;
+       nursery_section->next_data = nursery_end;
        /* we should also coalesce scanning from sections close to each other
         * and deal with pointers outside of the sections later.
         */
@@ -3154,7 +3393,7 @@ major_do_collection (const char *reason)
        TV_GETTIME (atv);
        init_pinning ();
        DEBUG (6, fprintf (gc_debug_file, "Collecting pinned addresses\n"));
-       pin_from_roots ((void*)lowest_heap_address, (void*)highest_heap_address);
+       pin_from_roots ((void*)lowest_heap_address, (void*)highest_heap_address, WORKERS_DISTRIBUTE_GRAY_QUEUE);
        optimize_pin_queue (0);
 
        /*
@@ -3198,7 +3437,12 @@ major_do_collection (const char *reason)
 
        major_collector.init_to_space ();
 
-       workers_start_all_workers (1);
+#ifdef SGEN_DEBUG_INTERNAL_ALLOC
+       main_gc_thread = pthread_self ();
+#endif
+
+       workers_start_all_workers ();
+       workers_start_marking ();
 
        if (mono_profiler_get_events () & MONO_PROFILE_GC_ROOTS)
                report_registered_roots ();
@@ -3206,15 +3450,26 @@ major_do_collection (const char *reason)
        time_major_scan_pinned += TV_ELAPSED_MS (btv, atv);
 
        /* registered roots, this includes static fields */
-       scan_from_registered_roots (major_collector.copy_or_mark_object, heap_start, heap_end, ROOT_TYPE_NORMAL, WORKERS_DISTRIBUTE_GRAY_QUEUE);
-       scan_from_registered_roots (major_collector.copy_or_mark_object, heap_start, heap_end, ROOT_TYPE_WBARRIER, WORKERS_DISTRIBUTE_GRAY_QUEUE);
+       scrrjd_normal.func = major_collector.copy_or_mark_object;
+       scrrjd_normal.heap_start = heap_start;
+       scrrjd_normal.heap_end = heap_end;
+       scrrjd_normal.root_type = ROOT_TYPE_NORMAL;
+       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_registered_roots, &scrrjd_normal);
+
+       scrrjd_wbarrier.func = major_collector.copy_or_mark_object;
+       scrrjd_wbarrier.heap_start = heap_start;
+       scrrjd_wbarrier.heap_end = heap_end;
+       scrrjd_wbarrier.root_type = ROOT_TYPE_WBARRIER;
+       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_registered_roots, &scrrjd_wbarrier);
+
        TV_GETTIME (btv);
        time_major_scan_registered_roots += TV_ELAPSED_MS (atv, btv);
 
        /* Threads */
-       /* FIXME: This is the wrong place for this, because it does
-          pinning */
-       scan_thread_data (heap_start, heap_end, TRUE);
+       stdjd.heap_start = heap_start;
+       stdjd.heap_end = heap_end;
+       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_thread_data, &stdjd);
+
        TV_GETTIME (atv);
        time_major_scan_thread_data += TV_ELAPSED_MS (btv, atv);
 
@@ -3223,9 +3478,14 @@ major_do_collection (const char *reason)
 
        if (mono_profiler_get_events () & MONO_PROFILE_GC_ROOTS)
                report_finalizer_roots ();
+
        /* scan the list of objects ready for finalization */
-       scan_finalizer_entries (major_collector.copy_or_mark_object, fin_ready_list, WORKERS_DISTRIBUTE_GRAY_QUEUE);
-       scan_finalizer_entries (major_collector.copy_or_mark_object, critical_fin_list, WORKERS_DISTRIBUTE_GRAY_QUEUE);
+       sfejd_fin_ready.list = fin_ready_list;
+       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_finalizer_entries, &sfejd_fin_ready);
+
+       sfejd_critical_fin.list = critical_fin_list;
+       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_finalizer_entries, &sfejd_critical_fin);
+
        TV_GETTIME (atv);
        time_major_scan_finalized += TV_ELAPSED_MS (btv, atv);
        DEBUG (2, fprintf (gc_debug_file, "Root scan: %d usecs\n", TV_ELAPSED (btv, atv)));
@@ -3236,12 +3496,15 @@ major_do_collection (const char *reason)
        if (major_collector.is_parallel) {
                while (!gray_object_queue_is_empty (WORKERS_DISTRIBUTE_GRAY_QUEUE)) {
                        workers_distribute_gray_queue_sections ();
-                       usleep (2000);
+                       usleep (1000);
                }
        }
-       workers_change_num_working (-1);
        workers_join ();
 
+#ifdef SGEN_DEBUG_INTERNAL_ALLOC
+       main_gc_thread = NULL;
+#endif
+
        if (major_collector.is_parallel)
                g_assert (gray_object_queue_is_empty (&gray_queue));
 
@@ -3250,6 +3513,15 @@ major_do_collection (const char *reason)
        TV_GETTIME (atv);
        time_major_finish_gray_stack += TV_ELAPSED_MS (btv, atv);
 
+       /*
+        * The (single-threaded) finalization code might have done
+        * some copying/marking so we can only reset the GC thread's
+        * worker data here instead of earlier when we joined the
+        * workers.
+        */
+       if (major_collector.reset_worker_data)
+               major_collector.reset_worker_data (workers_gc_thread_data.major_collector_data);
+
        if (objects_pinned) {
                /*This is slow, but we just OOM'd*/
                mono_sgen_pin_queue_clear_discarded_entries (nursery_section, old_next_pin_slot);
@@ -3260,7 +3532,7 @@ major_do_collection (const char *reason)
        }
 
        reset_heap_boundaries ();
-       mono_sgen_update_heap_boundaries ((mword)nursery_start, (mword)nursery_real_end);
+       mono_sgen_update_heap_boundaries ((mword)nursery_start, (mword)nursery_end);
 
        /* sweep the big objects list */
        prevbo = NULL;
@@ -3339,7 +3611,7 @@ major_do_collection (const char *reason)
 static void
 major_collection (const char *reason)
 {
-       if (g_getenv ("MONO_GC_NO_MAJOR")) {
+       if (disable_major_collections) {
                collect_nursery (0);
                return;
        }
@@ -3383,7 +3655,7 @@ minor_collect_or_expand_inner (size_t size)
                DEBUG (2, fprintf (gc_debug_file, "Heap size: %lu, LOS size: %lu\n", (unsigned long)total_alloc, (unsigned long)los_memory_usage));
                restart_world (0);
                /* this also sets the proper pointers for the next allocation */
-               if (!search_fragment_for_size (size)) {
+               if (!alloc_fragment_for_size (size)) {
                        int i;
                        /* TypeBuilder and MonoMethod are killing mcs with fragmentation */
                        DEBUG (1, fprintf (gc_debug_file, "nursery collection didn't find enough room for %zd alloc (%d pinned)\n", size, last_num_pinned));
@@ -3479,12 +3751,13 @@ setup_fragment (Fragment *frag, Fragment *prev, size_t size)
        fragment_freelist = frag;
 }
 
-/* check if we have a suitable fragment in nursery_fragments to be able to allocate
- * an object of size @size
- * Return FALSE if not found (which means we need a collection)
+/*
+ * Allocate a new nursery fragment able to hold an object of size @size.
+ * nursery_next and nursery_frag_real_end are set to the boundaries of the fragment.
+ * Return TRUE if a fragment was found, FALSE otherwise (which means we need a collection).
  */
 static gboolean
-search_fragment_for_size (size_t size)
+alloc_fragment_for_size (size_t size)
 {
        Fragment *frag, *prev;
        DEBUG (4, fprintf (gc_debug_file, "Searching nursery fragment %p, size: %zd\n", nursery_frag_real_end, size));
@@ -3506,11 +3779,11 @@ search_fragment_for_size (size_t size)
 }
 
 /*
- * Same as search_fragment_for_size but if search for @desired_size fails, try to satisfy @minimum_size.
+ * Same as alloc_fragment_for_size but if the search for @desired_size fails, try to satisfy @minimum_size.
  * This improves nursery usage.
  */
 static int
-search_fragment_for_size_range (size_t desired_size, size_t minimum_size)
+alloc_fragment_for_size_range (size_t desired_size, size_t minimum_size)
 {
        Fragment *frag, *prev, *min_prev;
        DEBUG (4, fprintf (gc_debug_file, "Searching nursery fragment %p, desired size: %zd minimum size %zd\n", nursery_frag_real_end, desired_size, minimum_size));
@@ -3603,7 +3876,7 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
                        collect_nursery (0);
                        restart_world (0);
                        mono_profiler_gc_event (MONO_GC_EVENT_END, 0);
-                       if (!degraded_mode && !search_fragment_for_size (size) && size <= MAX_SMALL_OBJ_SIZE) {
+                       if (!degraded_mode && !alloc_fragment_for_size (size) && size <= MAX_SMALL_OBJ_SIZE) {
                                // FIXME:
                                g_assert_not_reached ();
                        }
@@ -3683,7 +3956,7 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
                        if (size > tlab_size) {
                                /* Allocate directly from the nursery */
                                if (nursery_next + size >= nursery_frag_real_end) {
-                                       if (!search_fragment_for_size (size)) {
+                                       if (!alloc_fragment_for_size (size)) {
                                                minor_collect_or_expand_inner (size);
                                                if (degraded_mode) {
                                                        p = alloc_degraded (vtable, size);
@@ -3713,7 +3986,7 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
                                        if (available_in_nursery > MAX_NURSERY_TLAB_WASTE && available_in_nursery > size) {
                                                alloc_size = available_in_nursery;
                                        } else {
-                                               alloc_size = search_fragment_for_size_range (tlab_size, size);
+                                               alloc_size = alloc_fragment_for_size_range (tlab_size, size);
                                                if (!alloc_size) {
                                                        alloc_size = tlab_size;
                                                        minor_collect_or_expand_inner (tlab_size);
@@ -3930,7 +4203,7 @@ mono_gc_alloc_pinned_obj (MonoVTable *vtable, size_t size)
                p = mono_sgen_los_alloc_large_inner (vtable, size);
        } else {
                DEBUG (9, g_assert (vtable->klass->inited));
-               p = major_collector.alloc_small_pinned_obj (size, vtable->klass->has_references);
+               p = major_collector.alloc_small_pinned_obj (size, SGEN_VTABLE_HAS_REFERENCES (vtable));
        }
        if (G_LIKELY (p)) {
                DEBUG (6, fprintf (gc_debug_file, "Allocated pinned object %p, vtable: %p (%s), size: %zd\n", p, vtable, vtable->klass->name, size));
@@ -3950,6 +4223,9 @@ mono_gc_alloc_mature (MonoVTable *vtable)
        res = alloc_degraded (vtable, size);
        *res = vtable;
        UNLOCK_GC;
+       if (G_UNLIKELY (vtable->klass->has_finalize))
+               mono_object_register_finalizer ((MonoObject*)res);
+
        return res;
 }
 
@@ -4057,6 +4333,7 @@ finalize_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int g
                                        num_ready_finalizers++;
                                        hash_table->num_registered--;
                                        queue_finalization_entry (entry);
+                                       bridge_register_finalized_object ((MonoObject*)copy);
                                        /* Make it survive */
                                        from = entry->object;
                                        entry->object = copy;
@@ -4111,6 +4388,16 @@ object_is_reachable (char *object, char *start, char *end)
        return !object_is_fin_ready (object) || major_collector.is_object_live (object);
 }
 
+gboolean
+mono_sgen_object_is_live (void *obj)
+{
+       if (ptr_in_nursery (obj))
+               return object_is_pinned (obj);
+       if (current_collection_generation == GENERATION_NURSERY)
+               return FALSE;
+       return major_collector.is_object_live (obj);
+}
+
 /* LOCKING: requires that the GC lock is held */
 static void
 null_ephemerons_for_domain (MonoDomain *domain)
@@ -4199,11 +4486,11 @@ clear_unreachable_ephemerons (CopyOrMarkObjectFunc copy_func, char *start, char
                        if (was_promoted) {
                                if (ptr_in_nursery (key)) {/*key was not promoted*/
                                        DEBUG (5, fprintf (gc_debug_file, "\tAdded remset to key %p\n", key));
-                                       mono_sgen_add_to_global_remset (&cur->key);
+                                       mono_sgen_add_to_global_remset (queue->allocator, &cur->key);
                                }
                                if (ptr_in_nursery (cur->value)) {/*value was not promoted*/
                                        DEBUG (5, fprintf (gc_debug_file, "\tAdded remset to value %p\n", cur->value));
-                                       mono_sgen_add_to_global_remset (&cur->value);
+                                       mono_sgen_add_to_global_remset (queue->allocator, &cur->value);
                                }
                        }
                }
@@ -4272,7 +4559,7 @@ mark_ephemerons_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end
 
 /* LOCKING: requires that the GC lock is held */
 static void
-null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, GrayQueue *queue)
+null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, gboolean before_finalization, GrayQueue *queue)
 {
        DisappearingLinkHashTable *hash = get_dislink_hash_table (generation);
        DisappearingLink **disappearing_link_hash = hash->table;
@@ -4284,10 +4571,28 @@ null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int
        for (i = 0; i < disappearing_link_hash_size; ++i) {
                prev = NULL;
                for (entry = disappearing_link_hash [i]; entry;) {
-                       char *object = DISLINK_OBJECT (entry);
+                       char *object;
+                       gboolean track = DISLINK_TRACK (entry);
+
+                       /*
+                        * Tracked references are processed after
+                        * finalization handling whereas standard weak
+			 * references are processed before.  If an
+			 * object is still not marked after finalization
+			 * handling, it means that it either has no
+			 * finalizer or its finalizer has already run,
+			 * so we must null the tracking reference.
+                        */
+                       if (track == before_finalization) {
+                               prev = entry;
+                               entry = entry->next;
+                               continue;
+                       }
+
+                       object = DISLINK_OBJECT (entry);
+
                        if (object >= start && object < end && !major_collector.is_object_live (object)) {
-                               gboolean track = DISLINK_TRACK (entry);
-                               if (!track && object_is_fin_ready (object)) {
+                               if (object_is_fin_ready (object)) {
                                        void **p = entry->link;
                                        DisappearingLink *old;
                                        *p = NULL;
@@ -4334,14 +4639,7 @@ null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int
 
                                                continue;
                                        } else {
-                                               /* We set the track resurrection bit to
-                                                * FALSE if the object is to be finalized
-                                                * so that the object can be collected in
-                                                * the next cycle (i.e. after it was
-                                                * finalized).
-                                                */
-                                               *entry->link = HIDE_POINTER (copy,
-                                                       object_is_fin_ready (object) ? FALSE : track);
+                                               *entry->link = HIDE_POINTER (copy, track);
                                                DEBUG (5, fprintf (gc_debug_file, "Updated dislink at %p to %p\n", entry->link, DISLINK_OBJECT (entry)));
                                        }
                                }
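
The before_finalization flag added above splits weak-link processing into two passes, matching the calls added to finish_gray_stack earlier in this patch. In condensed form (same function names as the patch):

    /* pass 1, before finalization: non-tracking weak refs to dead
       objects are nulled, so they are already clear by the time the
       finalizable objects are queued */
    null_link_in_range (copy_func, start_addr, end_addr, generation, TRUE, queue);
    /* ... finalize_in_range () queues finalizable objects and keeps
       them (and whatever they reference) alive ... */
    /* pass 2, after finalization: resurrection-tracking weak refs are
       nulled if their target is truly dead, or updated to the copied
       object otherwise */
    null_link_in_range (copy_func, start_addr, end_addr, generation, FALSE, queue);
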
@@ -4839,11 +5137,6 @@ mono_gc_deregister_root (char* addr)
  * ######################################################################
  */
 
-/* FIXME: handle large/small config */
-#define HASH_PTHREAD_T(id) (((unsigned int)(id) >> 4) * 2654435761u)
-
-static SgenThreadInfo* thread_table [THREAD_HASH_SIZE];
-
 #if USE_SIGNAL_BASED_START_STOP_WORLD
 
 static MonoSemType suspend_ack_semaphore;
@@ -4851,38 +5144,31 @@ static MonoSemType *suspend_ack_semaphore_ptr;
 static unsigned int global_stop_count = 0;
 
 static sigset_t suspend_signal_mask;
-static mword cur_thread_regs [ARCH_NUM_REGS] = {0};
 
-/* LOCKING: assumes the GC lock is held */
-SgenThreadInfo**
-mono_sgen_get_thread_table (void)
-{
-       return thread_table;
-}
-
-SgenThreadInfo*
-mono_sgen_thread_info_lookup (ARCH_THREAD_TYPE id)
-{
-       unsigned int hash = HASH_PTHREAD_T (id) % THREAD_HASH_SIZE;
-       SgenThreadInfo *info;
-
-       info = thread_table [hash];
-       while (info && !ARCH_THREAD_EQUALS (info->id, id)) {
-               info = info->next;
-       }
-       return info;
-}
+#ifdef USE_MONO_CTX
+static MonoContext cur_thread_ctx = {0};
+#else
+static mword cur_thread_regs [ARCH_NUM_REGS] = {0};
+#endif
 
 static void
 update_current_thread_stack (void *start)
 {
+       int stack_guard = 0;
+#ifndef USE_MONO_CTX
        void *ptr = cur_thread_regs;
-       SgenThreadInfo *info = mono_sgen_thread_info_lookup (ARCH_GET_THREAD ());
+#endif
+       SgenThreadInfo *info = mono_thread_info_current ();
        
-       info->stack_start = align_pointer (&ptr);
+       info->stack_start = align_pointer (&stack_guard);
        g_assert (info->stack_start >= info->stack_start_limit && info->stack_start < info->stack_end);
+#ifdef USE_MONO_CTX
+       MONO_CONTEXT_GET_CURRENT (cur_thread_ctx);
+       info->monoctx = &cur_thread_ctx;
+#else
        ARCH_STORE_REGS (ptr);
        info->stopped_regs = ptr;
+#endif
        if (gc_callbacks.thread_suspend_func)
                gc_callbacks.thread_suspend_func (info->runtime_data, NULL);
 }
@@ -4908,6 +5194,9 @@ is_ip_in_managed_allocator (MonoDomain *domain, gpointer ip);
 void
 mono_sgen_wait_for_suspend_ack (int count)
 {
+#if defined(__MACH__) && MONO_MACH_ARCH_SUPPORTED
+		/* mach thread_resume is synchronous, so we don't need to wait for an ack */
+#else
        int i, result;
 
        for (i = 0; i < count; ++i) {
@@ -4917,57 +5206,49 @@ mono_sgen_wait_for_suspend_ack (int count)
                        }
                }
        }
+#endif
 }
 
 static int
 restart_threads_until_none_in_managed_allocator (void)
 {
        SgenThreadInfo *info;
-       int i, result, num_threads_died = 0;
+       int num_threads_died = 0;
        int sleep_duration = -1;
 
        for (;;) {
                int restart_count = 0, restarted_count = 0;
                /* restart all threads that stopped in the
                   allocator */
-               for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-                       for (info = thread_table [i]; info; info = info->next) {
-                               if (info->skip)
-                                       continue;
-                               if (!info->stack_start || info->in_critical_region ||
-                                               is_ip_in_managed_allocator (info->stopped_domain, info->stopped_ip)) {
-                                       binary_protocol_thread_restart ((gpointer)info->id);
-#if defined(__MACH__) && MONO_MACH_ARCH_SUPPORTED
-                                       result = thread_resume (pthread_mach_thread_np (info->id));
-#else
-                                       result = pthread_kill (info->id, restart_signal_num);
-#endif
-                                       if (result == 0) {
-                                               ++restart_count;
-                                       } else {
-                                               info->skip = 1;
-                                       }
+               FOREACH_THREAD_SAFE (info) {
+                       gboolean result;
+                       if (info->skip)
+                               continue;
+                       if (!info->stack_start || info->in_critical_region ||
+                                       is_ip_in_managed_allocator (info->stopped_domain, info->stopped_ip)) {
+                               binary_protocol_thread_restart ((gpointer)mono_thread_info_get_tid (info));
+                               result = mono_sgen_resume_thread (info);
+                               if (result) {
+                                       ++restart_count;
                                } else {
-                                       /* we set the stopped_ip to
-                                          NULL for threads which
-                                          we're not restarting so
-                                          that we can easily identify
-                                          the others */
-                                       info->stopped_ip = NULL;
-                                       info->stopped_domain = NULL;
+                                       info->skip = 1;
                                }
+                       } else {
+                               /* we set the stopped_ip to
+                                  NULL for threads which
+                                  we're not restarting so
+                                  that we can easily identify
+                                  the others */
+                               info->stopped_ip = NULL;
+                               info->stopped_domain = NULL;
                        }
-               }
+               } END_FOREACH_THREAD_SAFE
                /* if no threads were restarted, we're done */
                if (restart_count == 0)
                        break;
 
-#if defined(__MACH__) && MONO_MACH_ARCH_SUPPORTED
-               /* mach thread_resume is synchronous so we dont need to wait for them */
-#else
                /* wait for the threads to signal their restart */
                mono_sgen_wait_for_suspend_ack (restart_count);
-#endif
 
                if (sleep_duration < 0) {
                        sched_yield ();
@@ -4978,31 +5259,23 @@ restart_threads_until_none_in_managed_allocator (void)
                }
 
                /* stop them again */
-               for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-                       for (info = thread_table [i]; info; info = info->next) {
-                               if (info->skip || info->stopped_ip == NULL)
-                                       continue;
-#if defined(__MACH__) && MONO_MACH_ARCH_SUPPORTED
-                               result = thread_suspend (pthread_mach_thread_np (info->id));
-#else
-                               result = pthread_kill (info->id, suspend_signal_num);
-#endif
-                               if (result == 0) {
-                                       ++restarted_count;
-                               } else {
-                                       info->skip = 1;
-                               }
+               FOREACH_THREAD (info) {
+                       gboolean result;
+                       if (info->skip || info->stopped_ip == NULL)
+                               continue;
+                       result = mono_sgen_suspend_thread (info);
+
+                       if (result) {
+                               ++restarted_count;
+                       } else {
+                               info->skip = 1;
                        }
-               }
+               } END_FOREACH_THREAD
                /* some threads might have died */
                num_threads_died += restart_count - restarted_count;
-#if defined(__MACH__) && MONO_MACH_ARCH_SUPPORTED
-               /* mach thread_resume is synchronous so we dont need to wait for them */
-#else
                /* wait for the threads to signal their suspension
                   again */
                mono_sgen_wait_for_suspend_ack (restart_count);
-#endif
        }
 
        return num_threads_died;
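For orientation: the FOREACH_THREAD / FOREACH_THREAD_SAFE iterators and their END_* counterparts used above come from the shared thread-management code this commit introduces and are not defined in sgen-gc.c. Conceptually they wrap the same walk over a thread table that the removed nested loops spelled out by hand; the sketch below is an assumption for illustration, not the real definition (the _SAFE variant is presumably the same walk made safe against unlinking the current entry).

/* Sketch only: modelled on the nested hash-table loops removed in this patch.
 * The real macro definitions live with the shared thread-management code. */
#define FOREACH_THREAD(thread) {                                             \
        int __i;                                                             \
        for (__i = 0; __i < THREAD_HASH_SIZE; ++__i)                         \
                for ((thread) = thread_table [__i]; (thread); (thread) = (thread)->next)

#define END_FOREACH_THREAD }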
@@ -5013,14 +5286,16 @@ static void
 suspend_handler (int sig, siginfo_t *siginfo, void *context)
 {
        SgenThreadInfo *info;
-       pthread_t id;
        int stop_count;
        int old_errno = errno;
+#ifdef USE_MONO_CTX
+       MonoContext monoctx;
+#else
        gpointer regs [ARCH_NUM_REGS];
+#endif
        gpointer stack_start;
 
-       id = pthread_self ();
-       info = mono_sgen_thread_info_lookup (id);
+       info = mono_thread_info_current ();
        info->stopped_domain = mono_domain_get ();
        info->stopped_ip = (gpointer) ARCH_SIGCTX_IP (context);
        stop_count = global_stop_count;
@@ -5039,8 +5314,13 @@ suspend_handler (int sig, siginfo_t *siginfo, void *context)
        if (stack_start >= info->stack_start_limit && info->stack_start <= info->stack_end) {
                info->stack_start = stack_start;
 
+#ifdef USE_MONO_CTX
+               mono_sigctx_to_monoctx (context, &monoctx);
+               info->monoctx = &monoctx;
+#else
                ARCH_COPY_SIGCTX_REGS (regs, context);
                info->stopped_regs = regs;
+#endif
        } else {
                g_assert (!info->stack_start);
        }
@@ -5049,7 +5329,7 @@ suspend_handler (int sig, siginfo_t *siginfo, void *context)
        if (gc_callbacks.thread_suspend_func)
                gc_callbacks.thread_suspend_func (info->runtime_data, context);
 
-       DEBUG (4, fprintf (gc_debug_file, "Posting suspend_ack_semaphore for suspend from %p %p\n", info, (gpointer)ARCH_GET_THREAD ()));
+       DEBUG (4, fprintf (gc_debug_file, "Posting suspend_ack_semaphore for suspend from %p %p\n", info, (gpointer)mono_native_thread_id_get ()));
        /* notify the waiting thread */
        MONO_SEM_POST (suspend_ack_semaphore_ptr);
        info->stop_count = stop_count;
@@ -5060,7 +5340,7 @@ suspend_handler (int sig, siginfo_t *siginfo, void *context)
                sigsuspend (&suspend_signal_mask);
        } while (info->signal != restart_signal_num);
 
-       DEBUG (4, fprintf (gc_debug_file, "Posting suspend_ack_semaphore for resume from %p %p\n", info, (gpointer)ARCH_GET_THREAD ()));
+       DEBUG (4, fprintf (gc_debug_file, "Posting suspend_ack_semaphore for resume from %p %p\n", info, (gpointer)mono_native_thread_id_get ()));
        /* notify the waiting thread */
        MONO_SEM_POST (suspend_ack_semaphore_ptr);
 
@@ -5073,9 +5353,9 @@ restart_handler (int sig)
        SgenThreadInfo *info;
        int old_errno = errno;
 
-       info = mono_sgen_thread_info_lookup (pthread_self ());
+       info = mono_thread_info_current ();
        info->signal = restart_signal_num;
-       DEBUG (4, fprintf (gc_debug_file, "Restart handler in %p %p\n", info, (gpointer)ARCH_GET_THREAD ()));
+       DEBUG (4, fprintf (gc_debug_file, "Restart handler in %p %p\n", info, (gpointer)mono_native_thread_id_get ()));
 
        errno = old_errno;
 }
@@ -5107,7 +5387,7 @@ stop_world (int generation)
        update_current_thread_stack (&count);
 
        global_stop_count++;
-       DEBUG (3, fprintf (gc_debug_file, "stopping world n %d from %p %p\n", global_stop_count, mono_sgen_thread_info_lookup (ARCH_GET_THREAD ()), (gpointer)ARCH_GET_THREAD ()));
+       DEBUG (3, fprintf (gc_debug_file, "stopping world n %d from %p %p\n", global_stop_count, mono_thread_info_current (), (gpointer)mono_native_thread_id_get ()));
        TV_GETTIME (stop_world_time);
        count = mono_sgen_thread_handshake (suspend_signal_num);
        count -= restart_threads_until_none_in_managed_allocator ();
@@ -5121,7 +5401,7 @@ stop_world (int generation)
 static int
 restart_world (int generation)
 {
-       int count, i;
+       int count;
        SgenThreadInfo *info;
        TV_DECLARE (end_sw);
        unsigned long usec;
@@ -5134,12 +5414,14 @@ restart_world (int generation)
                }
        }
        mono_profiler_gc_event (MONO_GC_EVENT_PRE_START_WORLD, generation);
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       info->stack_start = NULL;
-                       info->stopped_regs = NULL;
-               }
-       }
+       FOREACH_THREAD (info) {
+               info->stack_start = NULL;
+#ifdef USE_MONO_CTX
+               info->monoctx = NULL;
+#else
+               info->stopped_regs = NULL;
+#endif
+       } END_FOREACH_THREAD
 
        release_gc_locks ();
 
@@ -5184,10 +5466,12 @@ mono_gc_conservatively_scan_area (void *start, void *end)
 void*
 mono_gc_scan_object (void *obj)
 {
+       UserCopyOrMarkData *data = pthread_getspecific (user_copy_or_mark_key);
+
        if (current_collection_generation == GENERATION_NURSERY)
-               major_collector.copy_object (&obj, &gray_queue);
+               major_collector.copy_object (&obj, data->queue);
        else
-               major_collector.copy_or_mark_object (&obj, &gray_queue);
+               major_collector.copy_or_mark_object (&obj, data->queue);
        return obj;
 }
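mono_gc_scan_object now takes its gray queue from thread-specific data instead of the global gray_queue. The definition of UserCopyOrMarkData is not part of this hunk; judging from the { NULL, queue } initializer in scan_thread_data further down, it is presumably shaped roughly like the sketch below (the field names are assumptions).

/* Assumed shape, for illustration only; the real definition lives elsewhere
 * in sgen-gc.c. The second member must be the gray queue dereferenced above. */
typedef struct {
        CopyOrMarkObjectFunc func;   /* NULL in the stack-scanning path */
        GrayQueue *queue;            /* queue passed to copy_object / copy_or_mark_object */
} UserCopyOrMarkData;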
 
@@ -5195,54 +5479,68 @@ mono_gc_scan_object (void *obj)
  * Mark from thread stacks and registers.
  */
 static void
-scan_thread_data (void *start_nursery, void *end_nursery, gboolean precise)
+scan_thread_data (void *start_nursery, void *end_nursery, gboolean precise, GrayQueue *queue)
 {
-       int i;
        SgenThreadInfo *info;
 
        scan_area_arg_start = start_nursery;
        scan_area_arg_end = end_nursery;
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       if (info->skip) {
-                               DEBUG (3, fprintf (gc_debug_file, "Skipping dead thread %p, range: %p-%p, size: %td\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start));
-                               continue;
-                       }
-                       DEBUG (3, fprintf (gc_debug_file, "Scanning thread %p, range: %p-%p, size: %td, pinned=%d\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start, next_pin_slot));
-                       if (gc_callbacks.thread_mark_func && !conservative_stack_mark)
-                               gc_callbacks.thread_mark_func (info->runtime_data, info->stack_start, info->stack_end, precise);
-                       else if (!precise)
-                               conservatively_pin_objects_from (info->stack_start, info->stack_end, start_nursery, end_nursery, PIN_TYPE_STACK);
-
-                       if (!precise)
-                               conservatively_pin_objects_from (info->stopped_regs, info->stopped_regs + ARCH_NUM_REGS,
-                                               start_nursery, end_nursery, PIN_TYPE_STACK);
+       FOREACH_THREAD (info) {
+               if (info->skip) {
+                       DEBUG (3, fprintf (gc_debug_file, "Skipping dead thread %p, range: %p-%p, size: %td\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start));
+                       continue;
                }
-       }
+               DEBUG (3, fprintf (gc_debug_file, "Scanning thread %p, range: %p-%p, size: %td, pinned=%d\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start, next_pin_slot));
+               if (gc_callbacks.thread_mark_func && !conservative_stack_mark) {
+                       UserCopyOrMarkData data = { NULL, queue };
+                       set_user_copy_or_mark_data (&data);
+                       gc_callbacks.thread_mark_func (info->runtime_data, info->stack_start, info->stack_end, precise);
+                       set_user_copy_or_mark_data (NULL);
+               } else if (!precise) {
+                       conservatively_pin_objects_from (info->stack_start, info->stack_end, start_nursery, end_nursery, PIN_TYPE_STACK);
+               }
+
+#ifdef USE_MONO_CTX
+               if (!precise)
+                       conservatively_pin_objects_from ((void**)info->monoctx, (void**)info->monoctx + ARCH_NUM_REGS,
+                               start_nursery, end_nursery, PIN_TYPE_STACK);
+#else
+               if (!precise)
+                       conservatively_pin_objects_from (info->stopped_regs, info->stopped_regs + ARCH_NUM_REGS,
+                                       start_nursery, end_nursery, PIN_TYPE_STACK);
+#endif
+       } END_FOREACH_THREAD
 }
 
 static void
 find_pinning_ref_from_thread (char *obj, size_t size)
 {
-       int i;
+       int j;
        SgenThreadInfo *info;
        char *endobj = obj + size;
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       char **start = (char**)info->stack_start;
-                       if (info->skip)
-                               continue;
-                       while (start < (char**)info->stack_end) {
-                               if (*start >= obj && *start < endobj) {
-                                       DEBUG (0, fprintf (gc_debug_file, "Object %p referenced in thread %p (id %p) at %p, stack: %p-%p\n", obj, info, (gpointer)info->id, start, info->stack_start, info->stack_end));
-                               }
-                               start++;
+       FOREACH_THREAD (info) {
+               char **start = (char**)info->stack_start;
+               if (info->skip)
+                       continue;
+               while (start < (char**)info->stack_end) {
+                       if (*start >= obj && *start < endobj) {
+                               DEBUG (0, fprintf (gc_debug_file, "Object %p referenced in thread %p (id %p) at %p, stack: %p-%p\n", obj, info, (gpointer)mono_thread_info_get_tid (info), start, info->stack_start, info->stack_end));
                        }
-
-                       /* FIXME: check info->stopped_regs */
+                       start++;
                }
+
+               for (j = 0; j < ARCH_NUM_REGS; ++j) {
+#ifdef USE_MONO_CTX
+                       mword w = ((mword*)info->monoctx) [j];
+#else
+                       mword w = (mword)info->stopped_regs [j];
+#endif
+
+                       if (w >= (mword)obj && w < (mword)obj + size)
+                               DEBUG (0, fprintf (gc_debug_file, "Object %p referenced in saved reg %d of thread %p (id %p)\n", obj, j, info, (gpointer)mono_thread_info_get_tid (info)));
+               } END_FOREACH_THREAD
        }
 }
 
@@ -5250,7 +5548,7 @@ static gboolean
 ptr_on_stack (void *ptr)
 {
        gpointer stack_start = &stack_start;
-       SgenThreadInfo *info = mono_sgen_thread_info_lookup (ARCH_GET_THREAD ());
+       SgenThreadInfo *info = mono_thread_info_current ();
 
        if (ptr >= stack_start && ptr < (gpointer)info->stack_end)
                return TRUE;
@@ -5286,7 +5584,7 @@ handle_remset (mword *p, void *start_nursery, void *end_nursery, gboolean global
                                 * becomes part of the global remset, which can grow very large.
                                 */
                                DEBUG (9, fprintf (gc_debug_file, "Add to global remset because of pinning %p (%p %s)\n", ptr, *ptr, safe_name (*ptr)));
-                               mono_sgen_add_to_global_remset (ptr);
+                               mono_sgen_add_to_global_remset (queue->allocator, ptr);
                        }
                } else {
                        DEBUG (9, fprintf (gc_debug_file, "Skipping remset at %p holding %p\n", ptr, *ptr));
@@ -5301,7 +5599,7 @@ handle_remset (mword *p, void *start_nursery, void *end_nursery, gboolean global
                        major_collector.copy_object (ptr, queue);
                        DEBUG (9, fprintf (gc_debug_file, "Overwrote remset at %p with %p (count: %d)\n", ptr, *ptr, (int)count));
                        if (!global && *ptr >= start_nursery && *ptr < end_nursery)
-                               mono_sgen_add_to_global_remset (ptr);
+                               mono_sgen_add_to_global_remset (queue->allocator, ptr);
                        ++ptr;
                }
                return p + 2;
@@ -5377,12 +5675,10 @@ remset_stats (void)
        int i;
        mword *addresses, *bumper, *p, *r;
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       for (remset = info->remset; remset; remset = remset->next)
-                               size += remset->store_next - remset->data;
-               }
-       }
+       FOREACH_THREAD (info) {
+               for (remset = info->remset; remset; remset = remset->next)
+                       size += remset->store_next - remset->data;
+       } END_FOREACH_THREAD
        for (remset = freed_thread_remsets; remset; remset = remset->next)
                size += remset->store_next - remset->data;
        for (remset = global_remset; remset; remset = remset->next)
@@ -5390,12 +5686,10 @@ remset_stats (void)
 
        bumper = addresses = mono_sgen_alloc_internal_dynamic (sizeof (mword) * size, INTERNAL_MEM_STATISTICS);
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       for (remset = info->remset; remset; remset = remset->next)
-                               bumper = collect_store_remsets (remset, bumper);
-               }
-       }
+       FOREACH_THREAD (info) {
+               for (remset = info->remset; remset; remset = remset->next)
+                       bumper = collect_store_remsets (remset, bumper);
+       } END_FOREACH_THREAD
        for (remset = global_remset; remset; remset = remset->next)
                bumper = collect_store_remsets (remset, bumper);
        for (remset = freed_thread_remsets; remset; remset = remset->next)
@@ -5434,18 +5728,11 @@ remset_byte_size (RememberedSet *remset)
 }
 
 static void
-scan_from_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue)
+scan_from_global_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue)
 {
-       int i;
-       SgenThreadInfo *info;
        RememberedSet *remset;
-       GenericStoreRememberedSet *store_remset;
        mword *p, *next_p, *store_pos;
 
-#ifdef HEAVY_STATISTICS
-       remset_stats ();
-#endif
-
        /* the global one */
        for (remset = global_remset; remset; remset = remset->next) {
                DEBUG (4, fprintf (gc_debug_file, "Scanning global remset range: %p-%p, size: %td\n", remset->data, remset->store_next, remset->store_next - remset->data));
@@ -5477,6 +5764,20 @@ scan_from_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue)
                /* Truncate the remset */
                remset->store_next = store_pos;
        }
+}
+
+static void
+scan_from_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue)
+{
+       int i;
+       SgenThreadInfo *info;
+       RememberedSet *remset;
+       GenericStoreRememberedSet *store_remset;
+       mword *p;
+
+#ifdef HEAVY_STATISTICS
+       remset_stats ();
+#endif
 
        /* the generic store ones */
        store_remset = generic_store_remsets;
@@ -5496,27 +5797,25 @@ scan_from_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue)
        generic_store_remsets = NULL;
 
        /* the per-thread ones */
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       RememberedSet *next;
-                       int j;
-                       for (remset = info->remset; remset; remset = next) {
-                               DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %td\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
-                               for (p = remset->data; p < remset->store_next;)
-                                       p = handle_remset (p, start_nursery, end_nursery, FALSE, queue);
-                               remset->store_next = remset->data;
-                               next = remset->next;
-                               remset->next = NULL;
-                               if (remset != info->remset) {
-                                       DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
-                                       mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
-                               }
+       FOREACH_THREAD (info) {
+               RememberedSet *next;
+               int j;
+               for (remset = info->remset; remset; remset = next) {
+                       DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %td\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
+                       for (p = remset->data; p < remset->store_next;)
+                               p = handle_remset (p, start_nursery, end_nursery, FALSE, queue);
+                       remset->store_next = remset->data;
+                       next = remset->next;
+                       remset->next = NULL;
+                       if (remset != info->remset) {
+                               DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
+                               mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
                        }
-                       for (j = 0; j < *info->store_remset_buffer_index_addr; ++j)
-                               handle_remset ((mword*)*info->store_remset_buffer_addr + j + 1, start_nursery, end_nursery, FALSE, queue);
-                       clear_thread_store_remset_buffer (info);
                }
-       }
+               for (j = 0; j < *info->store_remset_buffer_index_addr; ++j)
+                       handle_remset ((mword*)*info->store_remset_buffer_addr + j + 1, start_nursery, end_nursery, FALSE, queue);
+               clear_thread_store_remset_buffer (info);
+       } END_FOREACH_THREAD
 
        /* the freed thread ones */
        while (freed_thread_remsets) {
@@ -5540,7 +5839,6 @@ scan_from_remsets (void *start_nursery, void *end_nursery, GrayQueue *queue)
 static void
 clear_remsets (void)
 {
-       int i;
        SgenThreadInfo *info;
        RememberedSet *remset, *next;
 
@@ -5551,7 +5849,8 @@ clear_remsets (void)
                remset->next = NULL;
                if (remset != global_remset) {
                        DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
-                       mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
+                       mono_sgen_free_internal_dynamic_delayed (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET,
+                                       mono_sgen_get_unmanaged_allocator ());
                }
        }
        /* the generic store ones */
@@ -5561,20 +5860,18 @@ clear_remsets (void)
                generic_store_remsets = gs_next;
        }
        /* the per-thread ones */
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       for (remset = info->remset; remset; remset = next) {
-                               remset->store_next = remset->data;
-                               next = remset->next;
-                               remset->next = NULL;
-                               if (remset != info->remset) {
-                                       DEBUG (3, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
-                                       mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
-                               }
+       FOREACH_THREAD (info) {
+               for (remset = info->remset; remset; remset = next) {
+                       remset->store_next = remset->data;
+                       next = remset->next;
+                       remset->next = NULL;
+                       if (remset != info->remset) {
+                               DEBUG (3, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
+                               mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
                        }
-                       clear_thread_store_remset_buffer (info);
                }
-       }
+               clear_thread_store_remset_buffer (info);
+       } END_FOREACH_THREAD
 
        /* the freed thread ones */
        while (freed_thread_remsets) {
@@ -5592,33 +5889,24 @@ static void
 clear_tlabs (void)
 {
        SgenThreadInfo *info;
-       int i;
 
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       /* A new TLAB will be allocated when the thread does its first allocation */
-                       *info->tlab_start_addr = NULL;
-                       *info->tlab_next_addr = NULL;
-                       *info->tlab_temp_end_addr = NULL;
-                       *info->tlab_real_end_addr = NULL;
-               }
-       }
+       FOREACH_THREAD (info) {
+               /* A new TLAB will be allocated when the thread does its first allocation */
+               *info->tlab_start_addr = NULL;
+               *info->tlab_next_addr = NULL;
+               *info->tlab_temp_end_addr = NULL;
+               *info->tlab_real_end_addr = NULL;
+       } END_FOREACH_THREAD
 }
 
-/* LOCKING: assumes the GC lock is held */
-static SgenThreadInfo*
-gc_register_current_thread (void *addr)
+static void*
+sgen_thread_register (SgenThreadInfo* info, void *addr)
 {
-       int hash;
-       SgenThreadInfo* info = malloc (sizeof (SgenThreadInfo));
 #ifndef HAVE_KW_THREAD
        SgenThreadInfo *__thread_info__ = info;
 #endif
 
-       if (!info)
-               return NULL;
-
-       memset (info, 0, sizeof (SgenThreadInfo));
+       LOCK_GC;
 #ifndef HAVE_KW_THREAD
        info->tlab_start = info->tlab_next = info->tlab_temp_end = info->tlab_real_end = NULL;
 
@@ -5628,7 +5916,6 @@ gc_register_current_thread (void *addr)
        thread_info = info;
 #endif
 
-       info->id = ARCH_GET_THREAD ();
        info->stop_count = -1;
        info->skip = 0;
        info->signal = 0;
@@ -5641,15 +5928,23 @@ gc_register_current_thread (void *addr)
        info->store_remset_buffer_index_addr = &STORE_REMSET_BUFFER_INDEX;
        info->stopped_ip = NULL;
        info->stopped_domain = NULL;
+#ifdef USE_MONO_CTX
+       info->monoctx = NULL;
+#else
        info->stopped_regs = NULL;
+#endif
 
-       binary_protocol_thread_register ((gpointer)info->id);
+       binary_protocol_thread_register ((gpointer)mono_thread_info_get_tid (info));
 
 #ifdef HAVE_KW_THREAD
        tlab_next_addr = &tlab_next;
        store_remset_buffer_index_addr = &store_remset_buffer_index;
 #endif
 
+#if defined(__MACH__)
+       info->mach_port = mach_thread_self ();
+#endif
+
        /* try to get it with attributes first */
 #if defined(HAVE_PTHREAD_GETATTR_NP) && defined(HAVE_PTHREAD_ATTR_GETSTACK)
        {
@@ -5679,11 +5974,6 @@ gc_register_current_thread (void *addr)
        stack_end = info->stack_end;
 #endif
 
-       /* hash into the table */
-       hash = HASH_PTHREAD_T (info->id) % THREAD_HASH_SIZE;
-       info->next = thread_table [hash];
-       thread_table [hash] = info;
-
        info->remset = alloc_remset (DEFAULT_REMSET_SIZE, info);
        pthread_setspecific (remembered_set_key, info->remset);
 #ifdef HAVE_KW_THREAD
@@ -5693,11 +5983,12 @@ gc_register_current_thread (void *addr)
        STORE_REMSET_BUFFER = mono_sgen_alloc_internal (INTERNAL_MEM_STORE_REMSET);
        STORE_REMSET_BUFFER_INDEX = 0;
 
-       DEBUG (3, fprintf (gc_debug_file, "registered thread %p (%p) (hash: %d)\n", info, (gpointer)info->id, hash));
+       DEBUG (3, fprintf (gc_debug_file, "registered thread %p (%p)\n", info, (gpointer)mono_thread_info_get_tid (info)));
 
        if (gc_callbacks.thread_attach_func)
                info->runtime_data = gc_callbacks.thread_attach_func ();
 
+       UNLOCK_GC;
        return info;
 }
 
@@ -5711,29 +6002,27 @@ add_generic_store_remset_from_buffer (gpointer *buffer)
 }
 
 static void
-unregister_current_thread (void)
+sgen_thread_unregister (SgenThreadInfo *p)
 {
-       int hash;
-       SgenThreadInfo *prev = NULL;
-       SgenThreadInfo *p;
        RememberedSet *rset;
-       ARCH_THREAD_TYPE id = ARCH_GET_THREAD ();
+
+       /* If a delegate is passed to native code and invoked on a thread we don't
+        * know about, the JIT will register it with mono_jit_thread_attach, but
+        * we have no way of knowing when that thread goes away.  SGen has a TSD
+        * so we assume that if the domain is still registered, we can detach
+        * the thread.
+        */
+       if (mono_domain_get ())
+               mono_thread_detach (mono_thread_current ());
+
+       LOCK_GC;
 
-       binary_protocol_thread_unregister ((gpointer)id);
+       binary_protocol_thread_unregister ((gpointer)mono_thread_info_get_tid (p));
+       DEBUG (3, fprintf (gc_debug_file, "unregister thread %p (%p)\n", p, (gpointer)mono_thread_info_get_tid (p)));
 
-       hash = HASH_PTHREAD_T (id) % THREAD_HASH_SIZE;
-       p = thread_table [hash];
-       assert (p);
-       DEBUG (3, fprintf (gc_debug_file, "unregister thread %p (%p)\n", p, (gpointer)p->id));
-       while (!ARCH_THREAD_EQUALS (p->id, id)) {
-               prev = p;
-               p = p->next;
-       }
-       if (prev == NULL) {
-               thread_table [hash] = p->next;
-       } else {
-               prev->next = p->next;
-       }
+#if defined(__MACH__)
+       mach_port_deallocate (current_task (), p->mach_port);
+#endif
 
        if (gc_callbacks.thread_detach_func) {
                gc_callbacks.thread_detach_func (p->runtime_data);
@@ -5753,101 +6042,60 @@ unregister_current_thread (void)
        if (*p->store_remset_buffer_index_addr)
                add_generic_store_remset_from_buffer (*p->store_remset_buffer_addr);
        mono_sgen_free_internal (*p->store_remset_buffer_addr, INTERNAL_MEM_STORE_REMSET);
-       free (p);
-}
-
-static void
-unregister_thread (void *k)
-{
-       g_assert (!mono_domain_get ());
-       LOCK_GC;
-       unregister_current_thread ();
        UNLOCK_GC;
 }
 
-gboolean
-mono_gc_register_thread (void *baseptr)
-{
-       SgenThreadInfo *info;
 
+static void
+sgen_thread_attach (SgenThreadInfo *info)
+{
        LOCK_GC;
+       /* This is odd: can we get attached before the GC is initialized? */
        init_stats ();
-       info = mono_sgen_thread_info_lookup (ARCH_GET_THREAD ());
-       if (info == NULL) {
-               info = gc_register_current_thread (baseptr);
-       } else {
-               /* The main thread might get registered before callbacks are set */
-               if (gc_callbacks.thread_attach_func && !info->runtime_data)
-                       info->runtime_data = gc_callbacks.thread_attach_func ();
-       }
        UNLOCK_GC;
+       
+       if (gc_callbacks.thread_attach_func && !info->runtime_data)
+               info->runtime_data = gc_callbacks.thread_attach_func ();
 
        /* Need a better place to initialize this */
        if (!array_fill_vtable && mono_get_root_domain ()) {
                array_fill_vtable = mono_class_vtable (mono_get_root_domain (), mono_array_class_get (mono_defaults.byte_class, 1));
        }
-
-       return info != NULL;
+}
+
+gboolean
+mono_gc_register_thread (void *baseptr)
+{
+       return mono_thread_info_attach (baseptr) != NULL;
 }
 
-#if USE_PTHREAD_INTERCEPT
-
-typedef struct {
-       void *(*start_routine) (void *);
-       void *arg;
-       int flags;
-       MonoSemType registered;
-} SgenThreadStartInfo;
-
-static void*
-gc_start_thread (void *arg)
+/*
+ * mono_gc_set_stack_end:
+ *
+ *   Set the end of the current thread's stack to STACK_END. The stack space between
+ * STACK_END and the real end of the thread's stack will not be scanned during collections.
+ */
+void
+mono_gc_set_stack_end (void *stack_end)
 {
-       SgenThreadStartInfo *start_info = arg;
-       SgenThreadInfo* info;
-       void *t_arg = start_info->arg;
-       void *(*start_func) (void*) = start_info->start_routine;
-       void *result;
-       int post_result;
+       SgenThreadInfo *info;
 
        LOCK_GC;
-       info = gc_register_current_thread (&result);
-       UNLOCK_GC;
-       post_result = MONO_SEM_POST (&(start_info->registered));
-       g_assert (!post_result);
-       result = start_func (t_arg);
-       g_assert (!mono_domain_get ());
-       /*
-        * this is done by the pthread key dtor
-       LOCK_GC;
-       unregister_current_thread ();
+       info = mono_thread_info_current ();
+       if (info) {
+               g_assert (stack_end < info->stack_end);
+               info->stack_end = stack_end;
+       }
        UNLOCK_GC;
-       */
-
-       return result;
 }
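A hypothetical caller of the new mono_gc_set_stack_end entry point, purely for illustration; the function name and scenario below are assumptions, and only mono_gc_set_stack_end itself comes from this patch.

/* Hypothetical sketch: a thread re-entering the runtime from native code
 * marks the boundary so that the native caller frames between the marker
 * and the recorded stack end are never scanned conservatively. */
static void
enter_runtime_from_native (void)
{
        gpointer stack_marker;

        /* The address of a fresh local lies below the thread's recorded
         * stack end, so the g_assert in mono_gc_set_stack_end holds. */
        mono_gc_set_stack_end (&stack_marker);
}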
 
+#if USE_PTHREAD_INTERCEPT
+
+
 int
 mono_gc_pthread_create (pthread_t *new_thread, const pthread_attr_t *attr, void *(*start_routine)(void *), void *arg)
 {
-       SgenThreadStartInfo *start_info;
-       int result;
-
-       start_info = malloc (sizeof (SgenThreadStartInfo));
-       if (!start_info)
-               return ENOMEM;
-       MONO_SEM_INIT (&(start_info->registered), 0);
-       start_info->arg = arg;
-       start_info->start_routine = start_routine;
-
-       result = pthread_create (new_thread, attr, gc_start_thread, start_info);
-       if (result == 0) {
-               while (MONO_SEM_WAIT (&(start_info->registered)) != 0) {
-                       /*if (EINTR != errno) ABORT("sem_wait failed"); */
-               }
-       }
-       MONO_SEM_DESTROY (&(start_info->registered));
-       free (start_info);
-       return result;
+       return mono_threads_pthread_create (new_thread, attr, start_routine, arg);
 }
 
 int
@@ -5889,6 +6137,17 @@ alloc_remset (int size, gpointer id) {
        return res;
 }
 
+static RememberedSet*
+alloc_global_remset (SgenInternalAllocator *alc, int size, gpointer id)
+{
+       RememberedSet* res = mono_sgen_alloc_internal_full (alc, sizeof (RememberedSet) + (size * sizeof (gpointer)), INTERNAL_MEM_REMSET);
+       res->store_next = res->data;
+       res->end_set = res->data + size;
+       res->next = NULL;
+       DEBUG (4, fprintf (gc_debug_file, "Allocated global remset size %d at %p for %p\n", size, res->data, id));
+       return res;
+}
+
 /*
  * Note: the write barriers first do the needed GC work and then do the actual store:
  * this way the value is visible to the conservative GC scan after the write barrier
@@ -5925,7 +6184,7 @@ mono_gc_wbarrier_set_field (MonoObject *obj, gpointer field_ptr, MonoObject* val
                rs->next = REMEMBERED_SET;
                REMEMBERED_SET = rs;
 #ifdef HAVE_KW_THREAD
-               mono_sgen_thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
+               mono_thread_info_current ()->remset = rs;
 #endif
                *(rs->store_next++) = (mword)field_ptr;
                *(void**)field_ptr = value;
@@ -5963,7 +6222,7 @@ mono_gc_wbarrier_set_arrayref (MonoArray *arr, gpointer slot_ptr, MonoObject* va
                rs->next = REMEMBERED_SET;
                REMEMBERED_SET = rs;
 #ifdef HAVE_KW_THREAD
-               mono_sgen_thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
+               mono_thread_info_current ()->remset = rs;
 #endif
                *(rs->store_next++) = (mword)slot_ptr;
                *(void**)slot_ptr = value;
@@ -6026,7 +6285,7 @@ mono_gc_wbarrier_arrayref_copy (gpointer dest_ptr, gpointer src_ptr, int count)
                rs->next = REMEMBERED_SET;
                REMEMBERED_SET = rs;
 #ifdef HAVE_KW_THREAD
-               mono_sgen_thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
+               mono_thread_info_current ()->remset = rs;
 #endif
                *(rs->store_next++) = (mword)dest_ptr | REMSET_RANGE;
                *(rs->store_next++) = count;
@@ -6198,7 +6457,7 @@ mono_gc_wbarrier_value_copy (gpointer dest, gpointer src, int count, MonoClass *
                sgen_card_table_mark_range ((mword)dest, size);
        } else {
                rs = REMEMBERED_SET;
-               if (ptr_in_nursery (dest) || ptr_on_stack (dest) || !klass->has_references) {
+               if (ptr_in_nursery (dest) || ptr_on_stack (dest) || !SGEN_CLASS_HAS_REFERENCES (klass)) {
                        UNLOCK_GC;
                        return;
                }
@@ -6215,9 +6474,8 @@ mono_gc_wbarrier_value_copy (gpointer dest, gpointer src, int count, MonoClass *
                rs = alloc_remset (rs->end_set - rs->data, (void*)1);
                rs->next = REMEMBERED_SET;
                REMEMBERED_SET = rs;
-#ifdef HAVE_KW_THREAD
-               mono_sgen_thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
-#endif
+
+               mono_thread_info_current ()->remset = rs;
                *(rs->store_next++) = (mword)dest | REMSET_VTYPE;
                *(rs->store_next++) = (mword)klass->gc_descr;
                *(rs->store_next++) = (mword)count;
@@ -6257,9 +6515,9 @@ mono_gc_wbarrier_object_copy (MonoObject* obj, MonoObject *src)
        rs = alloc_remset (rs->end_set - rs->data, (void*)1);
        rs->next = REMEMBERED_SET;
        REMEMBERED_SET = rs;
-#ifdef HAVE_KW_THREAD
-       mono_sgen_thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
-#endif
+
+       mono_thread_info_current ()->remset = rs;
+
        *(rs->store_next++) = (mword)obj | REMSET_OBJECT;
        UNLOCK_GC;
 }
@@ -6423,23 +6681,21 @@ find_in_remsets (char *addr)
        }
 
        /* the per-thread ones */
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       int j;
-                       for (remset = info->remset; remset; remset = remset->next) {
-                               DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %td\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
-                               for (p = remset->data; p < remset->store_next;) {
-                                       p = find_in_remset_loc (p, addr, &found);
-                                       if (found)
-                                               return TRUE;
-                               }
-                       }
-                       for (j = 0; j < *info->store_remset_buffer_index_addr; ++j) {
-                               if ((*info->store_remset_buffer_addr) [j + 1] == addr)
+       FOREACH_THREAD (info) {
+               int j;
+               for (remset = info->remset; remset; remset = remset->next) {
+                       DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %td\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
+                       for (p = remset->data; p < remset->store_next;) {
+                               p = find_in_remset_loc (p, addr, &found);
+                               if (found)
                                        return TRUE;
                        }
                }
-       }
+               for (j = 0; j < *info->store_remset_buffer_index_addr; ++j) {
+                       if ((*info->store_remset_buffer_addr) [j + 1] == addr)
+                               return TRUE;
+               }
+       } END_FOREACH_THREAD
 
        /* the freed thread ones */
        for (remset = freed_thread_remsets; remset; remset = remset->next) {
@@ -6509,9 +6765,7 @@ check_consistency (void)
 
        DEBUG (1, fprintf (gc_debug_file, "Heap consistency check done.\n"));
 
-#ifdef SGEN_BINARY_PROTOCOL
-       if (!binary_protocol_file)
-#endif
+       if (!binary_protocol_is_enabled ())
                g_assert (!missing_remsets);
 }
 
@@ -6801,12 +7055,17 @@ mono_gc_make_root_descr_all_refs (int numbits)
 {
        gsize *gc_bitmap;
        void *descr;
+       int num_bytes = numbits / 8;
 
        if (numbits < 32 && all_ref_root_descrs [numbits])
                return all_ref_root_descrs [numbits];
 
-       gc_bitmap = g_malloc0 (ALIGN_TO (numbits, 8) + 1);
-       memset (gc_bitmap, 0xff, numbits / 8);
+       gc_bitmap = g_malloc0 (ALIGN_TO (ALIGN_TO (numbits, 8) + 1, sizeof (gsize)));
+       memset (gc_bitmap, 0xff, num_bytes);
+       if (numbits < ((sizeof (*gc_bitmap) * 8) - ROOT_DESC_TYPE_SHIFT)) 
+               gc_bitmap[0] = GUINT64_TO_LE(gc_bitmap[0]);
+       else if (numbits && num_bytes % (sizeof (*gc_bitmap)))
+               gc_bitmap[num_bytes / 8] = GUINT64_TO_LE(gc_bitmap [num_bytes / 8]);
        if (numbits % 8)
                gc_bitmap [numbits / 8] = (1 << (numbits % 8)) - 1;
        descr = mono_gc_make_descr_from_bitmap (gc_bitmap, numbits);
@@ -6866,7 +7125,7 @@ mono_gc_is_gc_thread (void)
 {
        gboolean result;
        LOCK_GC;
-        result = mono_sgen_thread_info_lookup (ARCH_GET_THREAD ()) != NULL;
+       result = mono_thread_info_current () != NULL;
        UNLOCK_GC;
        return result;
 }
@@ -6874,15 +7133,13 @@ mono_gc_is_gc_thread (void)
 void
 mono_gc_base_init (void)
 {
+       MonoThreadInfoCallbacks cb;
        char *env;
        char **opts, **ptr;
        char *major_collector_opt = NULL;
        struct sigaction sinfo;
        glong max_heap = 0;
-
-#ifdef PLATFORM_ANDROID
-       g_assert_not_reached ();
-#endif
+       int num_workers;
 
        /* the gc_initialized guard seems to imply this method is
           idempotent, but LOCK_INIT(gc_mutex) might not be.  It's
@@ -6895,7 +7152,12 @@ mono_gc_base_init (void)
                return;
        }
        pagesize = mono_pagesize ();
-       gc_debug_file = stderr;
+       gc_debug_file = stdout;
+
+       cb.thread_register = sgen_thread_register;
+       cb.thread_unregister = sgen_thread_unregister;
+       cb.thread_attach = sgen_thread_attach;
+       mono_threads_init (&cb, sizeof (SgenThreadInfo));
 
        LOCK_INIT (interruption_mutex);
        LOCK_INIT (global_remset_mutex);
@@ -6933,10 +7195,8 @@ mono_gc_base_init (void)
                mono_sgen_marksweep_fixed_init (&major_collector);
        } else if (!major_collector_opt || !strcmp (major_collector_opt, "marksweep-par")) {
                mono_sgen_marksweep_par_init (&major_collector);
-               workers_init (mono_cpu_count ());
        } else if (!major_collector_opt || !strcmp (major_collector_opt, "marksweep-fixed-par")) {
                mono_sgen_marksweep_fixed_par_init (&major_collector);
-               workers_init (mono_cpu_count ());
        } else if (!strcmp (major_collector_opt, "copying")) {
                mono_sgen_copying_init (&major_collector);
        } else {
@@ -6950,6 +7210,14 @@ mono_gc_base_init (void)
        use_cardtable = FALSE;
 #endif
 
+       num_workers = mono_cpu_count ();
+       g_assert (num_workers > 0);
+       if (num_workers > 16)
+               num_workers = 16;
+
+       /* Keep this the default for now */
+       conservative_stack_mark = TRUE;
+
        if (opts) {
                for (ptr = opts; *ptr; ++ptr) {
                        char *opt = *ptr;
@@ -6983,6 +7251,38 @@ mono_gc_base_init (void)
                                }
                                continue;
                        }
+                       if (g_str_has_prefix (opt, "workers=")) {
+                               long val;
+                               char *endptr;
+                               if (!major_collector.is_parallel) {
+                                       fprintf (stderr, "The workers= option can only be used for parallel collectors.\n");
+                                       exit (1);
+                               }
+                               opt = strchr (opt, '=') + 1;
+                               val = strtol (opt, &endptr, 10);
+                               if (!*opt || *endptr) {
+                                       fprintf (stderr, "Cannot parse the workers= option value.\n");
+                                       exit (1);
+                               }
+                               if (val <= 0 || val > 16) {
+                                       fprintf (stderr, "The number of workers must be in the range 1 to 16.\n");
+                                       exit (1);
+                               }
+                               num_workers = (int)val;
+                               continue;
+                       }
+                       if (g_str_has_prefix (opt, "stack-mark=")) {
+                               opt = strchr (opt, '=') + 1;
+                               if (!strcmp (opt, "precise")) {
+                                       conservative_stack_mark = FALSE;
+                               } else if (!strcmp (opt, "conservative")) {
+                                       conservative_stack_mark = TRUE;
+                               } else {
+                                       fprintf (stderr, "Invalid value '%s' for stack-mark= option, possible values are: 'precise', 'conservative'.\n", opt);
+                                       exit (1);
+                               }
+                               continue;
+                       }
 #ifdef USER_CONFIG
                        if (g_str_has_prefix (opt, "nursery-size=")) {
                                long val;
@@ -7012,6 +7312,7 @@ mono_gc_base_init (void)
                                fprintf (stderr, "  nursery-size=N (where N is an integer, possibly with a k, m or a g suffix)\n");
                                fprintf (stderr, "  major=COLLECTOR (where COLLECTOR is `marksweep', `marksweep-par' or `copying')\n");
                                fprintf (stderr, "  wbarrier=WBARRIER (where WBARRIER is `remset' or `cardtable')\n");
+                               fprintf (stderr, "  stack-mark=MARK-METHOD (where MARK-METHOD is 'precise' or 'conservative')\n");
                                if (major_collector.print_gc_param_usage)
                                        major_collector.print_gc_param_usage ();
                                exit (1);
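For illustration, the new stack-mark= and workers= knobs combine with the existing options through the MONO_GC_PARAMS environment variable; the comma-separated syntax and the program name below are assumptions, since this hunk only shows the option parsing.

MONO_GC_PARAMS=major=marksweep-par,workers=4,stack-mark=conservative mono program.exe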
@@ -7020,6 +7321,9 @@ mono_gc_base_init (void)
                g_strfreev (opts);
        }
 
+       if (major_collector.is_parallel)
+               workers_init (num_workers);
+
        if (major_collector_opt)
                g_free (major_collector_opt);
 
@@ -7057,12 +7361,14 @@ mono_gc_base_init (void)
                                xdomain_checks = TRUE;
                        } else if (!strcmp (opt, "clear-at-gc")) {
                                nursery_clear_policy = CLEAR_AT_GC;
-                       } else if (!strcmp (opt, "conservative-stack-mark")) {
-                               conservative_stack_mark = TRUE;
                        } else if (!strcmp (opt, "clear-nursery-at-gc")) {
                                nursery_clear_policy = CLEAR_AT_GC;
                        } else if (!strcmp (opt, "check-scan-starts")) {
                                do_scan_starts_check = TRUE;
+                       } else if (!strcmp (opt, "disable-minor")) {
+                               disable_minor_collections = TRUE;
+                       } else if (!strcmp (opt, "disable-major")) {
+                               disable_major_collections = TRUE;
                        } else if (g_str_has_prefix (opt, "heap-dump=")) {
                                char *filename = strchr (opt, '=') + 1;
                                nursery_clear_policy = CLEAR_AT_GC;
@@ -7072,18 +7378,27 @@ mono_gc_base_init (void)
 #ifdef SGEN_BINARY_PROTOCOL
                        } else if (g_str_has_prefix (opt, "binary-protocol=")) {
                                char *filename = strchr (opt, '=') + 1;
-                               binary_protocol_file = fopen (filename, "w");
+                               binary_protocol_init (filename);
 #endif
                        } else {
                                fprintf (stderr, "Invalid format for the MONO_GC_DEBUG env variable: '%s'\n", env);
                                fprintf (stderr, "The format is: MONO_GC_DEBUG=[l[:filename]|<option>]+ where l is a debug level 0-9.\n");
-                               fprintf (stderr, "Valid options are: collect-before-allocs[=<n>], check-at-minor-collections, xdomain-checks, clear-at-gc, conservative-stack-mark.\n");
+                               fprintf (stderr, "Valid options are:\n");
+                               fprintf (stderr, "  collect-before-allocs[=<n>]\n");
+                               fprintf (stderr, "  check-at-minor-collections\n");
+                               fprintf (stderr, "  disable-minor\n");
+                               fprintf (stderr, "  disable-major\n");
+                               fprintf (stderr, "  xdomain-checks\n");
+                               fprintf (stderr, "  clear-at-gc\n");
                                exit (1);
                        }
                }
                g_strfreev (opts);
        }
 
+       if (major_collector.post_param_init)
+               major_collector.post_param_init ();
+
        suspend_ack_semaphore_ptr = &suspend_ack_semaphore;
        MONO_SEM_INIT (&suspend_ack_semaphore, 0);
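Similarly, the new disable-minor and disable-major switches are MONO_GC_DEBUG options. A hypothetical setting matching the [l[:filename]|<option>]+ format printed above (log path, debug level and comma separation are assumptions) would be:

MONO_GC_DEBUG=3:/tmp/sgen.log,disable-major,check-at-minor-collections mono program.exe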
 
@@ -7105,7 +7420,7 @@ mono_gc_base_init (void)
        global_remset = alloc_remset (1024, NULL);
        global_remset->next = NULL;
 
-       pthread_key_create (&remembered_set_key, unregister_thread);
+       pthread_key_create (&remembered_set_key, NULL);
 
 #ifndef HAVE_KW_THREAD
        pthread_key_create (&thread_info_key, NULL);
@@ -7116,7 +7431,7 @@ mono_gc_base_init (void)
 
        gc_initialized = TRUE;
        UNLOCK_GC;
-       mono_gc_register_thread (&sinfo);
+       mono_thread_info_attach (&sinfo);
 }
 
 int
@@ -7139,6 +7454,12 @@ enum {
        mono_mb_emit_i4 ((mb), (offset));               \
        } while (0)
 #else
+
+/* 
+ * CEE_MONO_TLS requires the tls offset, not the key, so the code below only works on darwin,
+ * where the two are the same.
+ */
+#ifdef __APPLE__
 #define EMIT_TLS_ACCESS(mb,member,dummy)       do {    \
        mono_mb_emit_byte ((mb), MONO_CUSTOM_PREFIX);   \
        mono_mb_emit_byte ((mb), CEE_MONO_TLS);         \
@@ -7147,6 +7468,10 @@ enum {
        mono_mb_emit_byte ((mb), CEE_ADD);              \
        mono_mb_emit_byte ((mb), CEE_LDIND_I);          \
        } while (0)
+#else
+#define EMIT_TLS_ACCESS(mb,member,dummy)       do { g_error ("sgen is not supported when using --with-tls=pthread.\n"); } while (0)
+#endif
+
 #endif
 
 #ifdef MANAGED_ALLOCATION
@@ -7488,7 +7813,7 @@ mono_gc_get_managed_array_allocator (MonoVTable *vtable, int rank)
                return NULL;
        if (collect_before_allocs)
                return NULL;
-       g_assert (!klass->has_finalize && !klass->marshalbyref);
+       g_assert (!mono_class_has_finalizer (klass) && !klass->marshalbyref);
 
        return mono_gc_get_managed_allocator_by_type (ATYPE_VECTOR);
 #else
@@ -7594,9 +7919,9 @@ mono_gc_get_write_barrier (void)
                mono_mb_emit_ptr (mb, (gpointer) nursery_start);
                label_continue_1 = mono_mb_emit_branch (mb, CEE_BLT);
 
-               // if (ptr >= nursery_real_end)) goto continue;
+               // if (ptr >= nursery_end)) goto continue;
                mono_mb_emit_ldarg (mb, 0);
-               mono_mb_emit_ptr (mb, (gpointer) nursery_real_end);
+               mono_mb_emit_ptr (mb, (gpointer) nursery_end);
                label_continue_2 = mono_mb_emit_branch (mb, CEE_BGE);
 
                // Otherwise return
@@ -7619,7 +7944,7 @@ mono_gc_get_write_barrier (void)
 
                // if (*ptr >= nursery_end) return;
                mono_mb_emit_ldloc (mb, dereferenced_var);
-               mono_mb_emit_ptr (mb, (gpointer) nursery_real_end);
+               mono_mb_emit_ptr (mb, (gpointer) nursery_end);
                label_no_wb_5 = mono_mb_emit_branch (mb, CEE_BGE);
 
 #endif 
@@ -7756,4 +8081,17 @@ mono_sgen_debug_printf (int level, const char *format, ...)
        va_end (ap);
 }
 
+FILE*
+mono_sgen_get_logfile (void)
+{
+       return gc_debug_file;
+}
+
+#ifdef HOST_WIN32
+BOOL APIENTRY mono_gc_dllmain (HMODULE module_handle, DWORD reason, LPVOID reserved)
+{
+       return TRUE;
+}
+#endif
+
 #endif /* HAVE_SGEN_GC */