[sgen] Debug option for printing the heap usage and minor collection allowance.
[mono.git] / mono / metadata / sgen-gc.c
index ce4cb969857e2ad1408c831826b5b89053001674..d1ccb878c54de32b5618f1695efe1662493022f6 100644 (file)
@@ -3,6 +3,7 @@
  *
  * Author:
  *     Paolo Molaro (lupus@ximian.com)
+ *  Rodrigo Kumpera (kumpera@gmail.com)
  *
  * Copyright 2005-2010 Novell, Inc (http://www.novell.com)
  *
@@ -24,6 +25,7 @@
  *
  * Copyright 2001-2003 Ximian, Inc
  * Copyright 2003-2010 Novell, Inc.
+ * Copyright 2011 Xamarin, Inc.
  * 
  * Permission is hereby granted, free of charge, to any person obtaining
  * a copy of this software and associated documentation files (the
        A good place to start is add_nursery_frag. The tricky thing here is
        placing those objects atomically outside of a collection.
 
-
+ *) Allocation should use asymmetric Dekker synchronization:
+       http://blogs.oracle.com/dave/resource/Asymmetric-Dekker-Synchronization.txt
+       This should help weak consistency archs.
  */
 #include "config.h"
 #ifdef HAVE_SGEN_GC
 #include "metadata/threadpool-internals.h"
 #include "metadata/mempool-internals.h"
 #include "metadata/marshal.h"
+#include "metadata/runtime.h"
 #include "utils/mono-mmap.h"
 #include "utils/mono-time.h"
 #include "utils/mono-semaphore.h"
 #include "utils/mono-counters.h"
 #include "utils/mono-proclib.h"
+#include "utils/mono-memory-model.h"
 
 #include <mono/utils/memcheck.h>
 
@@ -243,7 +249,8 @@ enum {
  * ######################################################################
  */
 
-static int gc_initialized = 0;
+/* 0 means not initialized, 1 is initialized, -1 means in progress */
+static gint32 gc_initialized = 0;
 /* If set, do a minor collection before every X allocation */
 static guint32 collect_before_allocs = 0;
 /* If set, do a heap consistency check before each minor collection */
@@ -257,6 +264,7 @@ static gboolean conservative_stack_mark = FALSE;
 /* If set, do a plausibility check on the scan_starts before and after
    each collection */
 static gboolean do_scan_starts_check = FALSE;
+static gboolean nursery_collection_is_parallel = FALSE;
 static gboolean disable_minor_collections = FALSE;
 static gboolean disable_major_collections = FALSE;
 
@@ -289,9 +297,6 @@ static long long stat_global_remsets_readded = 0;
 static long long stat_global_remsets_processed = 0;
 static long long stat_global_remsets_discarded = 0;
 
-static long long stat_wasted_fragments_used = 0;
-static long long stat_wasted_fragments_bytes = 0;
-
 static int stat_wbarrier_set_field = 0;
 static int stat_wbarrier_set_arrayref = 0;
 static int stat_wbarrier_arrayref_copy = 0;
@@ -332,6 +337,7 @@ static long long time_major_fragment_creation = 0;
 
 int gc_debug_level = 0;
 FILE* gc_debug_file;
+static gboolean debug_print_allowance = FALSE;
 
 /*
 void
@@ -355,42 +361,8 @@ mono_gc_flush_info (void)
 
 #define ALIGN_TO(val,align) ((((guint64)val) + ((align) - 1)) & ~((align) - 1))
 
-/* The method used to clear the nursery */
-/* Clearing at nursery collections is the safest, but has bad interactions with caches.
- * Clearing at TLAB creation is much faster, but more complex and it might expose hard
- * to find bugs.
- */
-typedef enum {
-       CLEAR_AT_GC,
-       CLEAR_AT_TLAB_CREATION
-} NurseryClearPolicy;
-
 static NurseryClearPolicy nursery_clear_policy = CLEAR_AT_TLAB_CREATION;
 
-/*
- * The young generation is divided into fragments. This is because
- * we can hand one fragments to a thread for lock-less fast alloc and
- * because the young generation ends up fragmented anyway by pinned objects.
- * Once a collection is done, a list of fragments is created. When doing
- * thread local alloc we use smallish nurseries so we allow new threads to
- * allocate memory from gen0 without triggering a collection. Threads that
- * are found to allocate lots of memory are given bigger fragments. This
- * should make the finalizer thread use little nursery memory after a while.
- * We should start assigning threads very small fragments: if there are many
- * threads the nursery will be full of reserved space that the threads may not
- * use at all, slowing down allocation speed.
- * Thread local allocation is done from areas of memory Hotspot calls Thread Local 
- * Allocation Buffers (TLABs).
- */
-typedef struct _Fragment Fragment;
-
-struct _Fragment {
-       Fragment *next;
-       char *fragment_start;
-       char *fragment_limit; /* the current soft limit for allocation */
-       char *fragment_end;
-};
-
 /* the runtime can register areas of memory as roots: we keep two lists of roots,
  * a pinned root set for conservatively scanned roots and a normal one for
  * precisely scanned roots (currently implemented as a single list).
@@ -408,7 +380,7 @@ struct _RootRecord {
  * NULL to simplify the elimination of consecutive duplicate
  * entries.
  */
-#define STORE_REMSET_BUFFER_SIZE       1024
+#define STORE_REMSET_BUFFER_SIZE       1023
 
 typedef struct _GenericStoreRememberedSet GenericStoreRememberedSet;
 struct _GenericStoreRememberedSet {
@@ -423,7 +395,7 @@ enum {
        REMSET_LOCATION, /* just a pointer to the exact location */
        REMSET_RANGE,    /* range of pointer fields */
        REMSET_OBJECT,   /* mark all the object for scanning */
-       REMSET_VTYPE,    /* a valuetype array described by a gc descriptor and a count */
+       REMSET_VTYPE,    /* a valuetype array described by a gc descriptor, a count and a size */
        REMSET_TYPE_MASK = 0x3
 };
 
@@ -442,8 +414,7 @@ static gpointer global_remset_cache [2];
  * and doesn't waste any alloc paddin space.
  */
 #define DEFAULT_REMSET_SIZE 1024
-static RememberedSet* alloc_remset (int size, gpointer id);
-static RememberedSet* alloc_global_remset (SgenInternalAllocator *alc, int size, gpointer id);
+static RememberedSet* alloc_remset (int size, gpointer id, gboolean global);
 
 #define object_is_forwarded    SGEN_OBJECT_IS_FORWARDED
 #define object_is_pinned       SGEN_OBJECT_IS_PINNED
@@ -509,9 +480,6 @@ static int default_nursery_bits = 22;
 
 #define SCAN_START_SIZE        SGEN_SCAN_START_SIZE
 
-/* the minimum size of a fragment that we consider useful for allocation */
-#define FRAGMENT_MIN_SIZE (512)
-
 static mword pagesize = 4096;
 static mword nursery_size;
 static int degraded_mode = 0;
@@ -587,19 +555,10 @@ int current_collection_generation = -1;
 #define DISLINK_OBJECT(d)      (REVEAL_POINTER (*(d)->link))
 #define DISLINK_TRACK(d)       ((~(gulong)(*(d)->link)) & 1)
 
-/*
- * The finalizable hash has the object as the key, the 
- * disappearing_link hash, has the link address as key.
- */
-static FinalizeEntryHashTable minor_finalizable_hash;
-static FinalizeEntryHashTable major_finalizable_hash;
 /* objects that are ready to be finalized */
 static FinalizeEntry *fin_ready_list = NULL;
 static FinalizeEntry *critical_fin_list = NULL;
 
-static DisappearingLinkHashTable minor_disappearing_link_hash;
-static DisappearingLinkHashTable major_disappearing_link_hash;
-
 static EphemeronLinkNode *ephemeron_list;
 
 static int num_ready_finalizers = 0;
@@ -667,10 +626,8 @@ add_profile_gc_root (GCRootReport *report, void *object, int rtype, uintptr_t ex
  * MAX(nursery_last_pinned_end, nursery_frag_real_end)
  */
 static char *nursery_start = NULL;
-static char *nursery_next = NULL;
-static char *nursery_frag_real_end = NULL;
 static char *nursery_end = NULL;
-static char *nursery_last_pinned_end = NULL;
+static char *nursery_alloc_bound = NULL;
 
 #ifdef HAVE_KW_THREAD
 #define TLAB_ACCESS_INIT
@@ -695,9 +652,18 @@ static pthread_key_t thread_info_key;
 #define IN_CRITICAL_REGION (__thread_info__->in_critical_region)
 #endif
 
-/* we use the memory barrier only to prevent compiler reordering (a memory constraint may be enough) */
-#define ENTER_CRITICAL_REGION do {IN_CRITICAL_REGION = 1;mono_memory_barrier ();} while (0)
-#define EXIT_CRITICAL_REGION  do {IN_CRITICAL_REGION = 0;mono_memory_barrier ();} while (0)
+#ifndef DISABLE_CRITICAL_REGION
+
+/* Enter must be visible before anything is done in the critical region. */
+#define ENTER_CRITICAL_REGION do { mono_atomic_store_release (&IN_CRITICAL_REGION, 1); } while (0)
+
+/* Exit must make sure all critical region stores are visible before it signals the end of the region.
+ * FIXME: the original note here was left unfinished ("We don't need to emit a full barrier since we"); confirm the intended ordering rationale.
+ */
+#define EXIT_CRITICAL_REGION  do { mono_atomic_store_seq (&IN_CRITICAL_REGION, 0); } while (0)
+
+
+#endif
 
 /*
  * FIXME: What is faster, a TLS variable pointing to a structure, or separate TLS 
@@ -725,14 +691,6 @@ static __thread long *store_remset_buffer_index_addr;
  */
 static guint32 tlab_size = (1024 * 4);
 
-/*How much space is tolerable to be wasted from the current fragment when allocating a new TLAB*/
-#define MAX_NURSERY_TLAB_WASTE 512
-
-/* fragments that are free and ready to be used for allocation */
-static Fragment *nursery_fragments = NULL;
-/* freeelist of fragment structures */
-static Fragment *fragment_freelist = NULL;
-
 #define MAX_SMALL_OBJ_SIZE     SGEN_MAX_SMALL_OBJ_SIZE
 
 /* Functions supplied by the runtime to be called by the GC */
@@ -818,9 +776,6 @@ align_pointer (void *ptr)
 
 typedef SgenGrayQueue GrayQueue;
 
-typedef void (*CopyOrMarkObjectFunc) (void**, GrayQueue*);
-typedef char* (*ScanObjectFunc) (char*, GrayQueue*);
-
 /* forward declarations */
 static int stop_world (int generation);
 static int restart_world (int generation);
@@ -834,12 +789,11 @@ static void report_registered_roots (void);
 static void find_pinning_ref_from_thread (char *obj, size_t size);
 static void update_current_thread_stack (void *start);
 static void finalize_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, GrayQueue *queue);
-static void add_or_remove_disappearing_link (MonoObject *obj, void **link, gboolean track, int generation);
+static void process_fin_stage_entries (void);
 static void null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, gboolean before_finalization, GrayQueue *queue);
 static void null_links_for_domain (MonoDomain *domain, int generation);
-static gboolean alloc_fragment_for_size (size_t size);
-static int alloc_fragment_for_size_range (size_t desired_size, size_t minimum_size);
-static void clear_nursery_fragments (char *next);
+static void process_dislink_stage_entries (void);
+
 static void pin_from_roots (void *start_nursery, void *end_nursery, GrayQueue *queue);
 static int pin_objects_from_addresses (GCMemSection *section, void **start, void **end, void *start_nursery, void *end_nursery, GrayQueue *queue);
 static void optimize_pin_queue (int start_slot);
@@ -851,7 +805,10 @@ static void finish_gray_stack (char *start_addr, char *end_addr, int generation,
 static gboolean need_major_collection (mword space_needed);
 static void major_collection (const char *reason);
 
-static void mono_gc_register_disappearing_link (MonoObject *obj, void **link, gboolean track);
+static gboolean collection_is_parallel (void);
+
+static void mono_gc_register_disappearing_link (MonoObject *obj, void **link, gboolean track, gboolean in_gc);
+static gboolean mono_gc_is_critical_method (MonoMethod *method);
 
 void describe_ptr (char *ptr);
 void check_object (char *start);
@@ -949,9 +906,9 @@ alloc_complex_descriptor (gsize *bitmap, int numbits)
 }
 
 gsize*
-mono_sgen_get_complex_descriptor (GCVTable *vt)
+mono_sgen_get_complex_descriptor (mword desc)
 {
-       return complex_descriptors + (vt->desc >> LOW_TYPE_BITS);
+       return complex_descriptors + (desc >> LOW_TYPE_BITS);
 }
 
 /*
@@ -1003,12 +960,6 @@ mono_gc_make_descr_for_object (gsize *bitmap, int numbits, size_t obj_size)
                        DEBUG (6, fprintf (gc_debug_file, "Runlen descriptor %p, size: %zd, first set: %d, num set: %d\n", (void*)desc, stored_size, first_set, num_set));
                        return (void*) desc;
                }
-               /* we know the 2-word header is ptr-free */
-               if (last_set < SMALL_BITMAP_SIZE + OBJECT_HEADER_WORDS) {
-                       desc = DESC_TYPE_SMALL_BITMAP | (stored_size << 1) | ((*bitmap >> OBJECT_HEADER_WORDS) << SMALL_BITMAP_SHIFT);
-                       DEBUG (6, fprintf (gc_debug_file, "Smallbitmap descriptor %p, size: %zd, last set: %d\n", (void*)desc, stored_size, last_set));
-                       return (void*) desc;
-               }
        }
        /* we know the 2-word header is ptr-free */
        if (last_set < LARGE_BITMAP_SIZE + OBJECT_HEADER_WORDS) {
@@ -1081,14 +1032,18 @@ mono_gc_get_bitmap_for_descr (void *descr, int *numbits)
 
                return bitmap;
        }
-       case DESC_TYPE_SMALL_BITMAP:
-               bitmap = g_new0 (gsize, 1);
+       case DESC_TYPE_LARGE_BITMAP: {
+               gsize bmap = (d >> LOW_TYPE_BITS) << OBJECT_HEADER_WORDS;
 
-               bitmap [0] = (d >> SMALL_BITMAP_SHIFT) << OBJECT_HEADER_WORDS;
-
-           *numbits = GC_BITS_PER_WORD;
-               
+               bitmap = g_new0 (gsize, 1);
+               bitmap [0] = bmap;
+               *numbits = 0;
+               while (bmap) {
+                       (*numbits) ++;
+                       bmap >>= 1;
+               }
                return bitmap;
+       }
        default:
                g_assert_not_reached ();
        }
@@ -1363,30 +1318,6 @@ mono_gc_scan_for_specific_ref (MonoObject *key, gboolean precise)
        }
 }
 
-static void
-clear_current_nursery_fragment (char *next)
-{
-       if (nursery_clear_policy == CLEAR_AT_TLAB_CREATION) {
-               g_assert (next <= nursery_frag_real_end);
-               DEBUG (4, fprintf (gc_debug_file, "Clear nursery frag %p-%p\n", next, nursery_frag_real_end));
-               memset (next, 0, nursery_frag_real_end - next);
-       }
-}
-
-/* Clear all remaining nursery fragments */
-static void
-clear_nursery_fragments (char *next)
-{
-       Fragment *frag;
-       if (nursery_clear_policy == CLEAR_AT_TLAB_CREATION) {
-               clear_current_nursery_fragment (next);
-               for (frag = nursery_fragments; frag; frag = frag->next) {
-                       DEBUG (4, fprintf (gc_debug_file, "Clear nursery frag %p-%p\n", frag->fragment_start, frag->fragment_end));
-                       memset (frag->fragment_start, 0, frag->fragment_end - frag->fragment_start);
-               }
-       }
-}
-
 static gboolean
 need_remove_object_for_domain (char *start, MonoDomain *domain)
 {
@@ -1511,7 +1442,7 @@ clear_domain_process_object (char *obj, MonoDomain *domain)
        if (remove && ((MonoObject*)obj)->synchronisation) {
                void **dislink = mono_monitor_get_object_monitor_weak_link ((MonoObject*)obj);
                if (dislink)
-                       mono_gc_register_disappearing_link (NULL, dislink, FALSE);
+                       mono_gc_register_disappearing_link (NULL, dislink, FALSE, TRUE);
        }
 
        return remove;
@@ -1561,7 +1492,10 @@ mono_gc_clear_domain (MonoDomain * domain)
 
        LOCK_GC;
 
-       clear_nursery_fragments (nursery_next);
+       process_fin_stage_entries ();
+       process_dislink_stage_entries ();
+
+       mono_sgen_clear_nursery_fragments ();
 
        if (xdomain_checks && domain != mono_get_root_domain ()) {
                scan_for_registered_roots_in_domain (domain, ROOT_TYPE_NORMAL);
@@ -1665,10 +1599,10 @@ global_remset_location_was_not_added (gpointer ptr)
  * lock must be held.  For serial collectors that is not necessary.
  */
 void
-mono_sgen_add_to_global_remset (SgenInternalAllocator *alc, gpointer ptr)
+mono_sgen_add_to_global_remset (gpointer ptr)
 {
        RememberedSet *rs;
-       gboolean lock = major_collector.is_parallel;
+       gboolean lock = collection_is_parallel ();
 
        if (use_cardtable) {
                sgen_card_table_mark_address ((mword)ptr);
@@ -1696,7 +1630,7 @@ mono_sgen_add_to_global_remset (SgenInternalAllocator *alc, gpointer ptr)
                *(global_remset->store_next++) = (mword)ptr;
                goto done;
        }
-       rs = alloc_global_remset (alc, global_remset->end_set - global_remset->data, NULL);
+       rs = alloc_remset (global_remset->end_set - global_remset->data, NULL, TRUE);
        rs->next = global_remset;
        global_remset = rs;
        *(global_remset->store_next++) = (mword)ptr;
@@ -1728,17 +1662,19 @@ drain_gray_stack (GrayQueue *queue, int max_objs)
        char *obj;
 
        if (current_collection_generation == GENERATION_NURSERY) {
+               ScanObjectFunc scan_func = mono_sgen_get_minor_scan_object ();
+
                for (;;) {
                        GRAY_OBJECT_DEQUEUE (queue, obj);
                        if (!obj)
                                return TRUE;
                        DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", obj, safe_name (obj)));
-                       major_collector.minor_scan_object (obj, queue);
+                       scan_func (obj, queue);
                }
        } else {
                int i;
 
-               if (major_collector.is_parallel && queue == &workers_distribute_gray_queue)
+               if (collection_is_parallel () && queue == &workers_distribute_gray_queue)
                        return TRUE;
 
                do {
@@ -1772,30 +1708,8 @@ pin_objects_from_addresses (GCMemSection *section, void **start, void **end, voi
        void *addr;
        int idx;
        void **definitely_pinned = start;
-       Fragment *frag;
 
-       /*
-        * The code below starts the search from an entry in scan_starts, which might point into a nursery
-        * fragment containing random data. Clearing the nursery fragments takes a lot of time, and searching
-        * though them too, so lay arrays at each location inside a fragment where a search can start:
-        * - scan_locations[i]
-        * - start_nursery
-        * - the start of each fragment (the last_obj + last_obj case)
-        * The third encompasses the first two, since scan_locations [i] can't point inside a nursery fragment.
-        */
-       for (frag = nursery_fragments; frag; frag = frag->next) {
-               MonoArray *o;
-
-               g_assert (frag->fragment_end - frag->fragment_start >= sizeof (MonoArray));
-               o = (MonoArray*)frag->fragment_start;
-               memset (o, 0, sizeof (MonoArray));
-               g_assert (array_fill_vtable);
-               o->obj.vtable = array_fill_vtable;
-               /* Mark this as not a real object */
-               o->obj.synchronisation = GINT_TO_POINTER (-1);
-               o->max_length = (frag->fragment_end - frag->fragment_start) - sizeof (MonoArray);
-               g_assert (frag->fragment_start + safe_object_get_size ((MonoObject*)o) == frag->fragment_end);
-       }
+       mono_sgen_nursery_allocator_prepare_for_pinning ();
 
        while (start < end) {
                addr = *start;
@@ -1899,7 +1813,7 @@ mono_sgen_pin_objects_in_section (GCMemSection *section, GrayQueue *queue)
 void
 mono_sgen_pin_object (void *object, GrayQueue *queue)
 {
-       if (major_collector.is_parallel) {
+       if (collection_is_parallel ()) {
                LOCK_PIN_QUEUE;
                /*object arrives pinned*/
                pin_stage_ptr (object);
@@ -2216,33 +2130,6 @@ mono_sgen_update_heap_boundaries (mword low, mword high)
        } while (SGEN_CAS_PTR ((gpointer*)&highest_heap_address, (gpointer)high, (gpointer)old) != (gpointer)old);
 }
 
-static Fragment*
-alloc_fragment (void)
-{
-       Fragment *frag = fragment_freelist;
-       if (frag) {
-               fragment_freelist = frag->next;
-               frag->next = NULL;
-               return frag;
-       }
-       frag = mono_sgen_alloc_internal (INTERNAL_MEM_FRAGMENT);
-       frag->next = NULL;
-       return frag;
-}
-static void
-add_fragment (char *start, char *end)
-{
-       Fragment *fragment;
-
-       fragment = alloc_fragment ();
-       fragment->fragment_start = start;
-       fragment->fragment_limit = start;
-       fragment->fragment_end = end;
-       fragment->next = nursery_fragments;
-       nursery_fragments = fragment;
-}
-
 /* size must be a power of 2 */
 void*
 mono_sgen_alloc_os_memory_aligned (mword size, mword alignment, gboolean activate)
@@ -2308,8 +2195,7 @@ alloc_nursery (void)
 
        nursery_section = section;
 
-       /* Setup the single first large fragment */
-       add_fragment (nursery_start, nursery_end);
+       mono_sgen_nursery_allocator_set_nursery_bounds (nursery_start, nursery_end);
 }
 
 void*
@@ -2449,32 +2335,6 @@ scan_finalizer_entries (CopyOrMarkObjectFunc copy_func, FinalizeEntry *list, Gra
        }
 }
 
-static mword fragment_total = 0;
-/*
- * We found a fragment of free memory in the nursery: memzero it and if
- * it is big enough, add it to the list of fragments that can be used for
- * allocation.
- */
-static void
-add_nursery_frag (size_t frag_size, char* frag_start, char* frag_end)
-{
-       DEBUG (4, fprintf (gc_debug_file, "Found empty fragment: %p-%p, size: %zd\n", frag_start, frag_end, frag_size));
-       binary_protocol_empty (frag_start, frag_size);
-       /* Not worth dealing with smaller fragments: need to tune */
-       if (frag_size >= FRAGMENT_MIN_SIZE) {
-               /* memsetting just the first chunk start is bound to provide better cache locality */
-               if (nursery_clear_policy == CLEAR_AT_GC)
-                       memset (frag_start, 0, frag_size);
-
-               add_fragment (frag_start, frag_end);
-               fragment_total += frag_size;
-       } else {
-               /* Clear unused fragments, pinning depends on this */
-               /*TODO place an int[] here instead of the memset if size justify it*/
-               memset (frag_start, 0, frag_size);
-       }
-}
-
 static const char*
 generation_name (int generation)
 {
@@ -2485,26 +2345,6 @@ generation_name (int generation)
        }
 }
 
-static DisappearingLinkHashTable*
-get_dislink_hash_table (int generation)
-{
-       switch (generation) {
-       case GENERATION_NURSERY: return &minor_disappearing_link_hash;
-       case GENERATION_OLD: return &major_disappearing_link_hash;
-       default: g_assert_not_reached ();
-       }
-}
-
-static FinalizeEntryHashTable*
-get_finalize_entry_hash_table (int generation)
-{
-       switch (generation) {
-       case GENERATION_NURSERY: return &minor_finalizable_hash;
-       case GENERATION_OLD: return &major_finalizable_hash;
-       default: g_assert_not_reached ();
-       }
-}
-
 static MonoObject **finalized_array = NULL;
 static int finalized_array_capacity = 0;
 static int finalized_array_entries = 0;
@@ -2527,6 +2367,53 @@ bridge_register_finalized_object (MonoObject *object)
        finalized_array [finalized_array_entries++] = object;
 }
 
+static void
+bridge_process (void)
+{
+       if (finalized_array_entries <= 0)
+               return;
+
+       g_assert (mono_sgen_need_bridge_processing ());
+       mono_sgen_bridge_processing (finalized_array_entries, finalized_array);
+
+       finalized_array_entries = 0;
+}
+
+CopyOrMarkObjectFunc
+mono_sgen_get_copy_object (void)
+{
+       if (current_collection_generation == GENERATION_NURSERY) {
+               if (collection_is_parallel ())
+                       return major_collector.copy_object;
+               else
+                       return major_collector.nopar_copy_object;
+       } else {
+               return major_collector.copy_or_mark_object;
+       }
+}
+
+ScanObjectFunc
+mono_sgen_get_minor_scan_object (void)
+{
+       g_assert (current_collection_generation == GENERATION_NURSERY);
+
+       if (collection_is_parallel ())
+               return major_collector.minor_scan_object;
+       else
+               return major_collector.nopar_minor_scan_object;
+}
+
+ScanVTypeFunc
+mono_sgen_get_minor_scan_vtype (void)
+{
+       g_assert (current_collection_generation == GENERATION_NURSERY);
+
+       if (collection_is_parallel ())
+               return major_collector.minor_scan_vtype;
+       else
+               return major_collector.nopar_minor_scan_vtype;
+}
+
 static void
 finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *queue)
 {
@@ -2535,7 +2422,7 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
        int fin_ready;
        int ephemeron_rounds = 0;
        int num_loops;
-       CopyOrMarkObjectFunc copy_func = current_collection_generation == GENERATION_NURSERY ? major_collector.copy_object : major_collector.copy_or_mark_object;
+       CopyOrMarkObjectFunc copy_func = mono_sgen_get_copy_object ();
 
        /*
         * We copied all the reachable objects. Now it's the time to copy
@@ -2596,11 +2483,8 @@ finish_gray_stack (char *start_addr, char *end_addr, int generation, GrayQueue *
                if (generation == GENERATION_OLD)
                        finalize_in_range (copy_func, nursery_start, nursery_end, GENERATION_NURSERY, queue);
 
-               if (fin_ready != num_ready_finalizers) {
+               if (fin_ready != num_ready_finalizers)
                        ++num_loops;
-                       if (finalized_array != NULL)
-                               mono_sgen_bridge_processing (finalized_array_entries, finalized_array);
-               }
 
                /* drain the new stack that might have been created */
                DEBUG (6, fprintf (gc_debug_file, "Precise scan of gray area post fin\n"));
@@ -2663,53 +2547,6 @@ check_scan_starts (void)
 
 static int last_num_pinned = 0;
 
-static void
-build_nursery_fragments (void **start, int num_entries)
-{
-       char *frag_start, *frag_end;
-       size_t frag_size;
-       int i;
-
-       while (nursery_fragments) {
-               Fragment *next = nursery_fragments->next;
-               nursery_fragments->next = fragment_freelist;
-               fragment_freelist = nursery_fragments;
-               nursery_fragments = next;
-       }
-       frag_start = nursery_start;
-       fragment_total = 0;
-       /* clear scan starts */
-       memset (nursery_section->scan_starts, 0, nursery_section->num_scan_start * sizeof (gpointer));
-       for (i = 0; i < num_entries; ++i) {
-               frag_end = start [i];
-               /* remove the pin bit from pinned objects */
-               unpin_object (frag_end);
-               nursery_section->scan_starts [((char*)frag_end - (char*)nursery_section->data)/SCAN_START_SIZE] = frag_end;
-               frag_size = frag_end - frag_start;
-               if (frag_size)
-                       add_nursery_frag (frag_size, frag_start, frag_end);
-               frag_size = ALIGN_UP (safe_object_get_size ((MonoObject*)start [i]));
-               frag_start = (char*)start [i] + frag_size;
-       }
-       nursery_last_pinned_end = frag_start;
-       frag_end = nursery_end;
-       frag_size = frag_end - frag_start;
-       if (frag_size)
-               add_nursery_frag (frag_size, frag_start, frag_end);
-       if (!nursery_fragments) {
-               DEBUG (1, fprintf (gc_debug_file, "Nursery fully pinned (%d)\n", num_entries));
-               for (i = 0; i < num_entries; ++i) {
-                       DEBUG (3, fprintf (gc_debug_file, "Bastard pinning obj %p (%s), size: %d\n", start [i], safe_name (start [i]), safe_object_get_size (start [i])));
-               }
-               degraded_mode = 1;
-       }
-
-       nursery_next = nursery_frag_real_end = NULL;
-
-       /* Clear TLABs for all threads */
-       clear_tlabs ();
-}
-
 static void
 scan_from_registered_roots (CopyOrMarkObjectFunc copy_func, char *addr_start, char *addr_end, int root_type, GrayQueue *queue)
 {
@@ -2854,7 +2691,7 @@ mono_sgen_register_moved_object (void *obj, void *destination)
        g_assert (mono_profiler_events & MONO_PROFILE_GC_MOVES);
 
        /* FIXME: handle this for parallel collector */
-       g_assert (!major_collector.is_parallel);
+       g_assert (!collection_is_parallel ());
 
        if (moved_objects_idx == MOVED_OBJECTS_NUM) {
                mono_profiler_gc_moves (moved_objects, moved_objects_idx);
@@ -2926,8 +2763,7 @@ init_stats (void)
        mono_counters_register ("# nursery copy_object() failed forwarded", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_nursery_copy_object_failed_forwarded);
        mono_counters_register ("# nursery copy_object() failed pinned", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_nursery_copy_object_failed_pinned);
 
-       mono_counters_register ("# wasted fragments used", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_wasted_fragments_used);
-       mono_counters_register ("bytes in wasted fragments", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_wasted_fragments_bytes);
+       mono_sgen_nursery_allocator_init_heavy_stats ();
 
        mono_counters_register ("Store remsets", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_store_remsets);
        mono_counters_register ("Unique store remsets", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_store_remsets_unique);
@@ -3000,6 +2836,17 @@ try_calculate_minor_collection_allowance (gboolean overwrite)
 
        minor_collection_allowance = MAX (MIN (allowance_target, num_major_sections * major_collector.section_size + los_memory_usage), MIN_MINOR_COLLECTION_ALLOWANCE);
 
+       if (debug_print_allowance) {
+               mword old_major = last_collection_old_num_major_sections * major_collector.section_size;
+               mword new_major = num_major_sections * major_collector.section_size;
+
+               fprintf (gc_debug_file, "Before collection: %ld bytes (%ld major, %ld LOS)\n",
+                               old_major + last_collection_old_los_memory_usage, old_major, last_collection_old_los_memory_usage);
+               fprintf (gc_debug_file, "After collection: %ld bytes (%ld major, %ld LOS)\n",
+                               new_major + last_collection_los_memory_usage, new_major, last_collection_los_memory_usage);
+               fprintf (gc_debug_file, "Allowance: %ld bytes\n", minor_collection_allowance);
+       }
+
        if (major_collector.have_computed_minor_collection_allowance)
                major_collector.have_computed_minor_collection_allowance ();
 
@@ -3020,6 +2867,25 @@ mono_sgen_need_major_collection (mword space_needed)
        return need_major_collection (space_needed);
 }
 
+static gboolean
+collection_is_parallel (void)
+{
+       switch (current_collection_generation) {
+       case GENERATION_NURSERY:
+               return nursery_collection_is_parallel;
+       case GENERATION_OLD:
+               return major_collector.is_parallel;
+       default:
+               g_assert_not_reached ();
+       }
+}
+
+gboolean
+mono_sgen_nursery_collection_is_parallel (void)
+{
+       return nursery_collection_is_parallel;
+}
+
 static GrayQueue*
 job_gray_queue (WorkerData *worker_data)
 {
@@ -3083,10 +2949,11 @@ collect_nursery (size_t requested_size)
 {
        gboolean needs_major;
        size_t max_garbage_amount;
-       char *orig_nursery_next;
+       char *nursery_next;
        ScanFromRemsetsJobData sfrjd;
        ScanFromRegisteredRootsJobData scrrjd_normal, scrrjd_wbarrier;
        ScanThreadDataJobData stdjd;
+       mword fragment_total;
        TV_DECLARE (all_atv);
        TV_DECLARE (all_btv);
        TV_DECLARE (atv);
@@ -3104,11 +2971,12 @@ collect_nursery (size_t requested_size)
 
        degraded_mode = 0;
        objects_pinned = 0;
-       orig_nursery_next = nursery_next;
-       nursery_next = MAX (nursery_next, nursery_last_pinned_end);
+       nursery_next = mono_sgen_nursery_alloc_get_upper_alloc_bound ();
        /* FIXME: optimize later to use the higher address where an object can be present */
        nursery_next = MAX (nursery_next, nursery_end);
 
+       nursery_alloc_bound = nursery_next;
+
        DEBUG (1, fprintf (gc_debug_file, "Start nursery collection %d %p-%p, size: %d\n", num_minor_gcs, nursery_start, nursery_next, (int)(nursery_next - nursery_start)));
        max_garbage_amount = nursery_next - nursery_start;
        g_assert (nursery_section->size >= max_garbage_amount);
@@ -3118,7 +2986,7 @@ collect_nursery (size_t requested_size)
        atv = all_atv;
 
        /* Pinning no longer depends on clearing all nursery fragments */
-       clear_current_nursery_fragment (orig_nursery_next);
+       mono_sgen_clear_current_nursery_fragment ();
 
        TV_GETTIME (btv);
        time_minor_pre_collection_fragment_clear += TV_ELAPSED_MS (atv, btv);
@@ -3132,7 +3000,7 @@ collect_nursery (size_t requested_size)
 
        try_calculate_minor_collection_allowance (FALSE);
 
-       gray_object_queue_init (&gray_queue, mono_sgen_get_unmanaged_allocator ());
+       gray_object_queue_init (&gray_queue);
        workers_init_distribute_gray_queue ();
 
        num_minor_gcs++;
@@ -3140,6 +3008,9 @@ collect_nursery (size_t requested_size)
 
        global_remset_cache_clear ();
 
+       process_fin_stage_entries ();
+       process_dislink_stage_entries ();
+
        /* pin from pinned handles */
        init_pinning ();
        mono_profiler_gc_event (MONO_GC_EVENT_MARK_START, 0);
@@ -3172,7 +3043,7 @@ collect_nursery (size_t requested_size)
 
        sfrjd.heap_start = nursery_start;
        sfrjd.heap_end = nursery_next;
-       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_remsets, &sfrjd);
+       workers_enqueue_job (job_scan_from_remsets, &sfrjd);
 
        /* we don't have complete write barrier yet, so we scan all the old generation sections */
        TV_GETTIME (btv);
@@ -3187,7 +3058,7 @@ collect_nursery (size_t requested_size)
                time_minor_scan_card_table += TV_ELAPSED_MS (atv, btv);
        }
 
-       if (!major_collector.is_parallel)
+       if (!collection_is_parallel ())
                drain_gray_stack (&gray_queue, -1);
 
        if (mono_profiler_get_events () & MONO_PROFILE_GC_ROOTS)
@@ -3198,17 +3069,17 @@ collect_nursery (size_t requested_size)
        time_minor_scan_pinned += TV_ELAPSED_MS (btv, atv);
 
        /* registered roots, this includes static fields */
-       scrrjd_normal.func = major_collector.copy_object;
+       scrrjd_normal.func = collection_is_parallel () ? major_collector.copy_object : major_collector.nopar_copy_object;
        scrrjd_normal.heap_start = nursery_start;
        scrrjd_normal.heap_end = nursery_next;
        scrrjd_normal.root_type = ROOT_TYPE_NORMAL;
-       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_registered_roots, &scrrjd_normal);
+       workers_enqueue_job (job_scan_from_registered_roots, &scrrjd_normal);
 
-       scrrjd_wbarrier.func = major_collector.copy_object;
+       scrrjd_wbarrier.func = collection_is_parallel () ? major_collector.copy_object : major_collector.nopar_copy_object;
        scrrjd_wbarrier.heap_start = nursery_start;
        scrrjd_wbarrier.heap_end = nursery_next;
        scrrjd_wbarrier.root_type = ROOT_TYPE_WBARRIER;
-       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_registered_roots, &scrrjd_wbarrier);
+       workers_enqueue_job (job_scan_from_registered_roots, &scrrjd_wbarrier);
 
        TV_GETTIME (btv);
        time_minor_scan_registered_roots += TV_ELAPSED_MS (atv, btv);
@@ -3216,13 +3087,13 @@ collect_nursery (size_t requested_size)
        /* thread data */
        stdjd.heap_start = nursery_start;
        stdjd.heap_end = nursery_next;
-       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_thread_data, &stdjd);
+       workers_enqueue_job (job_scan_thread_data, &stdjd);
 
        TV_GETTIME (atv);
        time_minor_scan_thread_data += TV_ELAPSED_MS (btv, atv);
        btv = atv;
 
-       if (major_collector.is_parallel) {
+       if (collection_is_parallel ()) {
                while (!gray_object_queue_is_empty (WORKERS_DISTRIBUTE_GRAY_QUEUE)) {
                        workers_distribute_gray_queue_sections ();
                        usleep (1000);
@@ -3230,7 +3101,7 @@ collect_nursery (size_t requested_size)
        }
        workers_join ();
 
-       if (major_collector.is_parallel)
+       if (collection_is_parallel ())
                g_assert (gray_object_queue_is_empty (&gray_queue));
 
        finish_gray_stack (nursery_start, nursery_next, GENERATION_NURSERY, &gray_queue);
@@ -3259,7 +3130,13 @@ collect_nursery (size_t requested_size)
         * next allocations.
         */
        mono_profiler_gc_event (MONO_GC_EVENT_RECLAIM_START, 0);
-       build_nursery_fragments (pin_queue, next_pin_slot);
+       fragment_total = mono_sgen_build_nursery_fragments (nursery_section, pin_queue, next_pin_slot);
+       if (!fragment_total)
+               degraded_mode = 1;
+
+       /* Clear TLABs for all threads */
+       clear_tlabs ();
+
        mono_profiler_gc_event (MONO_GC_EVENT_RECLAIM_END, 0);
        TV_GETTIME (btv);
        time_minor_fragment_creation += TV_ELAPSED_MS (atv, btv);
@@ -3352,7 +3229,7 @@ major_do_collection (const char *reason)
 
        binary_protocol_collection (GENERATION_OLD);
        check_scan_starts ();
-       gray_object_queue_init (&gray_queue, mono_sgen_get_unmanaged_allocator ());
+       gray_object_queue_init (&gray_queue);
        workers_init_distribute_gray_queue ();
 
        degraded_mode = 0;
@@ -3365,7 +3242,7 @@ major_do_collection (const char *reason)
        atv = all_atv;
 
        /* Pinning depends on this */
-       clear_nursery_fragments (nursery_next);
+       mono_sgen_clear_nursery_fragments ();
 
        TV_GETTIME (btv);
        time_major_pre_collection_fragment_clear += TV_ELAPSED_MS (atv, btv);
@@ -3390,6 +3267,9 @@ major_do_collection (const char *reason)
        if (use_cardtable)
                card_table_clear ();
 
+       process_fin_stage_entries ();
+       process_dislink_stage_entries ();
+
        TV_GETTIME (atv);
        init_pinning ();
        DEBUG (6, fprintf (gc_debug_file, "Collecting pinned addresses\n"));
@@ -3454,13 +3334,13 @@ major_do_collection (const char *reason)
        scrrjd_normal.heap_start = heap_start;
        scrrjd_normal.heap_end = heap_end;
        scrrjd_normal.root_type = ROOT_TYPE_NORMAL;
-       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_registered_roots, &scrrjd_normal);
+       workers_enqueue_job (job_scan_from_registered_roots, &scrrjd_normal);
 
        scrrjd_wbarrier.func = major_collector.copy_or_mark_object;
        scrrjd_wbarrier.heap_start = heap_start;
        scrrjd_wbarrier.heap_end = heap_end;
        scrrjd_wbarrier.root_type = ROOT_TYPE_WBARRIER;
-       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_from_registered_roots, &scrrjd_wbarrier);
+       workers_enqueue_job (job_scan_from_registered_roots, &scrrjd_wbarrier);
 
        TV_GETTIME (btv);
        time_major_scan_registered_roots += TV_ELAPSED_MS (atv, btv);
@@ -3468,7 +3348,7 @@ major_do_collection (const char *reason)
        /* Threads */
        stdjd.heap_start = heap_start;
        stdjd.heap_end = heap_end;
-       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_thread_data, &stdjd);
+       workers_enqueue_job (job_scan_thread_data, &stdjd);
 
        TV_GETTIME (atv);
        time_major_scan_thread_data += TV_ELAPSED_MS (btv, atv);
@@ -3481,10 +3361,10 @@ major_do_collection (const char *reason)
 
        /* scan the list of objects ready for finalization */
        sfejd_fin_ready.list = fin_ready_list;
-       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_finalizer_entries, &sfejd_fin_ready);
+       workers_enqueue_job (job_scan_finalizer_entries, &sfejd_fin_ready);
 
        sfejd_critical_fin.list = critical_fin_list;
-       workers_enqueue_job (workers_distribute_gray_queue.allocator, job_scan_finalizer_entries, &sfejd_critical_fin);
+       workers_enqueue_job (job_scan_finalizer_entries, &sfejd_critical_fin);
 
        TV_GETTIME (atv);
        time_major_scan_finalized += TV_ELAPSED_MS (btv, atv);
@@ -3573,7 +3453,11 @@ major_do_collection (const char *reason)
         * pinned objects as we go, memzero() the empty fragments so they are ready for the
         * next allocations.
         */
-       build_nursery_fragments (nursery_section->pin_queue_start, nursery_section->pin_queue_num_entries);
+       if (!mono_sgen_build_nursery_fragments (nursery_section, nursery_section->pin_queue_start, nursery_section->pin_queue_num_entries))
+               degraded_mode = 1;
+
+       /* Clear TLABs for all threads */
+       clear_tlabs ();
 
        TV_GETTIME (atv);
        time_major_fragment_creation += TV_ELAPSED_MS (btv, atv);
@@ -3655,7 +3539,7 @@ minor_collect_or_expand_inner (size_t size)
                DEBUG (2, fprintf (gc_debug_file, "Heap size: %lu, LOS size: %lu\n", (unsigned long)total_alloc, (unsigned long)los_memory_usage));
                restart_world (0);
                /* this also sets the proper pointers for the next allocation */
-               if (!alloc_fragment_for_size (size)) {
+               if (!mono_sgen_can_alloc_size (size)) {
                        int i;
                        /* TypeBuilder and MonoMethod are killing mcs with fragmentation */
                        DEBUG (1, fprintf (gc_debug_file, "nursery collection didn't find enough room for %zd alloc (%d pinned)\n", size, last_num_pinned));
@@ -3735,100 +3619,23 @@ mono_sgen_free_os_memory (void *addr, size_t size)
  * *) allocation of pinned objects
  */
 
-static void
-setup_fragment (Fragment *frag, Fragment *prev, size_t size)
-{
-       /* remove from the list */
-       if (prev)
-               prev->next = frag->next;
-       else
-               nursery_fragments = frag->next;
-       nursery_next = frag->fragment_start;
-       nursery_frag_real_end = frag->fragment_end;
-
-       DEBUG (4, fprintf (gc_debug_file, "Using nursery fragment %p-%p, size: %td (req: %zd)\n", nursery_next, nursery_frag_real_end, nursery_frag_real_end - nursery_next, size));
-       frag->next = fragment_freelist;
-       fragment_freelist = frag;
-}
-
-/*
- * Allocate a new nursery fragment able to hold an object of size @size.
- * nursery_next and nursery_frag_real_end are set to the boundaries of the fragment.
- * Return TRUE if found, FALSE otherwise.
- */
-static gboolean
-alloc_fragment_for_size (size_t size)
-{
-       Fragment *frag, *prev;
-       DEBUG (4, fprintf (gc_debug_file, "Searching nursery fragment %p, size: %zd\n", nursery_frag_real_end, size));
-
-       if (nursery_frag_real_end > nursery_next && nursery_clear_policy == CLEAR_AT_TLAB_CREATION) {
-               /* Clear the remaining space, pinning depends on this */
-               memset (nursery_next, 0, nursery_frag_real_end - nursery_next);
-       }
-
-       prev = NULL;
-       for (frag = nursery_fragments; frag; frag = frag->next) {
-               if (size <= (frag->fragment_end - frag->fragment_start)) {
-                       setup_fragment (frag, prev, size);
-                       return TRUE;
-               }
-               prev = frag;
-       }
-       return FALSE;
-}
-
-/*
- * Same as alloc_fragment_for_size but if search for @desired_size fails, try to satisfy @minimum_size.
- * This improves nursery usage.
- */
-static int
-alloc_fragment_for_size_range (size_t desired_size, size_t minimum_size)
-{
-       Fragment *frag, *prev, *min_prev;
-       DEBUG (4, fprintf (gc_debug_file, "Searching nursery fragment %p, desired size: %zd minimum size %zd\n", nursery_frag_real_end, desired_size, minimum_size));
-
-       if (nursery_frag_real_end > nursery_next && nursery_clear_policy == CLEAR_AT_TLAB_CREATION) {
-               /* Clear the remaining space, pinning depends on this */
-               memset (nursery_next, 0, nursery_frag_real_end - nursery_next);
-       }
-
-       min_prev = GINT_TO_POINTER (-1);
-       prev = NULL;
-
-       for (frag = nursery_fragments; frag; frag = frag->next) {
-               int frag_size = frag->fragment_end - frag->fragment_start;
-               if (desired_size <= frag_size) {
-                       setup_fragment (frag, prev, desired_size);
-                       return desired_size;
+static void*
+alloc_degraded (MonoVTable *vtable, size_t size, gboolean for_mature)
+{
+       static int last_major_gc_warned = -1;
+       static int num_degraded = 0;
+
+       if (!for_mature) {
+               if (last_major_gc_warned < num_major_gcs) {
+                       ++num_degraded;
+                       if (num_degraded == 1 || num_degraded == 3)
+                               fprintf (stderr, "Warning: Degraded allocation.  Consider increasing nursery-size if the warning persists.\n");
+                       else if (num_degraded == 10)
+                               fprintf (stderr, "Warning: Repeated degraded allocation.  Consider increasing nursery-size.\n");
+                       last_major_gc_warned = num_major_gcs;
                }
-               if (minimum_size <= frag_size)
-                       min_prev = prev;
-
-               prev = frag;
-       }
-
-       if (min_prev != GINT_TO_POINTER (-1)) {
-               int frag_size;
-               if (min_prev)
-                       frag = min_prev->next;
-               else
-                       frag = nursery_fragments;
-
-               frag_size = frag->fragment_end - frag->fragment_start;
-               HEAVY_STAT (++stat_wasted_fragments_used);
-               HEAVY_STAT (stat_wasted_fragments_bytes += frag_size);
-
-               setup_fragment (frag, min_prev, minimum_size);
-               return frag_size;
        }
 
-       return 0;
-}
-
-static void*
-alloc_degraded (MonoVTable *vtable, size_t size)
-{
        if (need_major_collection (0)) {
                mono_profiler_gc_event (MONO_GC_EVENT_START, 1);
                stop_world (1);
@@ -3876,7 +3683,7 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
                        collect_nursery (0);
                        restart_world (0);
                        mono_profiler_gc_event (MONO_GC_EVENT_END, 0);
-                       if (!degraded_mode && !alloc_fragment_for_size (size) && size <= MAX_SMALL_OBJ_SIZE) {
+                       if (!degraded_mode && !mono_sgen_can_alloc_size (size) && size <= MAX_SMALL_OBJ_SIZE) {
                                // FIXME:
                                g_assert_not_reached ();
                        }
@@ -3915,9 +3722,7 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
                        DEBUG (6, fprintf (gc_debug_file, "Allocated object %p, vtable: %p (%s), size: %zd\n", p, vtable, vtable->klass->name, size));
                        binary_protocol_alloc (p , vtable, size);
                        g_assert (*p == NULL);
-                       *p = vtable;
-
-                       g_assert (TLAB_NEXT == new_next);
+                       mono_atomic_store_seq (p, vtable);
 
                        return p;
                }
@@ -3933,8 +3738,8 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
                 * This avoids taking again the GC lock when registering, but this is moot when
                 * doing thread-local allocation, so it may not be a good idea.
                 */
-               g_assert (TLAB_NEXT == new_next);
                if (TLAB_NEXT >= TLAB_REAL_END) {
+                       int available_in_tlab;
                        /* 
                         * Run out of space in the TLAB. When this happens, some amount of space
                         * remains in the TLAB, but not enough to satisfy the current allocation
@@ -3947,28 +3752,28 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
                         * for a while, to decrease the number of useless nursery collections.
                         */
                        if (degraded_mode && degraded_mode < DEFAULT_NURSERY_SIZE) {
-                               p = alloc_degraded (vtable, size);
+                               p = alloc_degraded (vtable, size, FALSE);
                                binary_protocol_alloc_degraded (p, vtable, size);
                                return p;
                        }
 
-                       /*FIXME This codepath is current deadcode since tlab_size > MAX_SMALL_OBJ_SIZE*/
-                       if (size > tlab_size) {
+                       available_in_tlab = TLAB_REAL_END - TLAB_NEXT;
+                       if (size > tlab_size || available_in_tlab > SGEN_MAX_NURSERY_WASTE) {
                                /* Allocate directly from the nursery */
-                               if (nursery_next + size >= nursery_frag_real_end) {
-                                       if (!alloc_fragment_for_size (size)) {
+                               do {
+                                       p = mono_sgen_nursery_alloc (size);
+                                       if (!p) {
                                                minor_collect_or_expand_inner (size);
                                                if (degraded_mode) {
-                                                       p = alloc_degraded (vtable, size);
+                                                       p = alloc_degraded (vtable, size, FALSE);
                                                        binary_protocol_alloc_degraded (p, vtable, size);
                                                        return p;
+                                               } else {
+                                                       p = mono_sgen_nursery_alloc (size);
                                                }
                                        }
-                               }
-
-                               p = (void*)nursery_next;
-                               nursery_next += size;
-                               if (nursery_next > nursery_frag_real_end) {
+                               } while (!p);
+                               if (!p) {
                                        // no space left
                                        g_assert (0);
                                }
@@ -3977,31 +3782,32 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
                                        memset (p, 0, size);
                                }
                        } else {
-                               int alloc_size = tlab_size;
-                               int available_in_nursery = nursery_frag_real_end - nursery_next;
+                               int alloc_size = 0;
                                if (TLAB_START)
                                        DEBUG (3, fprintf (gc_debug_file, "Retire TLAB: %p-%p [%ld]\n", TLAB_START, TLAB_REAL_END, (long)(TLAB_REAL_END - TLAB_NEXT - size)));
+                               mono_sgen_nursery_retire_region (p, available_in_tlab);
 
-                               if (alloc_size >= available_in_nursery) {
-                                       if (available_in_nursery > MAX_NURSERY_TLAB_WASTE && available_in_nursery > size) {
-                                               alloc_size = available_in_nursery;
-                                       } else {
-                                               alloc_size = alloc_fragment_for_size_range (tlab_size, size);
-                                               if (!alloc_size) {
-                                                       alloc_size = tlab_size;
-                                                       minor_collect_or_expand_inner (tlab_size);
-                                                       if (degraded_mode) {
-                                                               p = alloc_degraded (vtable, size);
-                                                               binary_protocol_alloc_degraded (p, vtable, size);
-                                                               return p;
-                                                       }
-                                               }
+                               do {
+                                       p = mono_sgen_nursery_alloc_range (tlab_size, size, &alloc_size);
+                                       if (!p) {
+                                               minor_collect_or_expand_inner (tlab_size);
+                                               if (degraded_mode) {
+                                                       p = alloc_degraded (vtable, size, FALSE);
+                                                       binary_protocol_alloc_degraded (p, vtable, size);
+                                                       return p;
+                                               } else {
+                                                       p = mono_sgen_nursery_alloc_range (tlab_size, size, &alloc_size);
+                                               }               
                                        }
+                               } while (!p);
+                                       
+                               if (!p) {
+                                       // no space left
+                                       g_assert (0);
                                }
 
                                /* Allocate a new TLAB from the current nursery fragment */
-                               TLAB_START = nursery_next;
-                               nursery_next += alloc_size;
+                               TLAB_START = (char*)p;
                                TLAB_NEXT = TLAB_START;
                                TLAB_REAL_END = TLAB_START + alloc_size;
                                TLAB_TEMP_END = TLAB_START + MIN (SCAN_START_SIZE, alloc_size);
@@ -4013,7 +3819,6 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
                                /* Allocate from the TLAB */
                                p = (void*)TLAB_NEXT;
                                TLAB_NEXT += size;
-                               g_assert (TLAB_NEXT <= TLAB_REAL_END);
 
                                nursery_section->scan_starts [((char*)p - (char*)nursery_section->data)/SCAN_START_SIZE] = (char*)p;
                        }
@@ -4031,7 +3836,7 @@ mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
        if (G_LIKELY (p)) {
                DEBUG (6, fprintf (gc_debug_file, "Allocated object %p, vtable: %p (%s), size: %zd\n", p, vtable, vtable->klass->name, size));
                binary_protocol_alloc (p, vtable, size);
-               *p = vtable;
+               mono_atomic_store_seq (p, vtable);
        }
 
        return p;
@@ -4047,36 +3852,78 @@ mono_gc_try_alloc_obj_nolock (MonoVTable *vtable, size_t size)
        size = ALIGN_UP (size);
 
        g_assert (vtable->gc_descr);
-       if (size <= MAX_SMALL_OBJ_SIZE) {
+       if (size > MAX_SMALL_OBJ_SIZE)
+               return NULL;
+
+       if (G_UNLIKELY (size > tlab_size)) {
+               /* Allocate directly from the nursery */
+               p = mono_sgen_nursery_alloc (size);
+               if (!p)
+                       return NULL;
+
+               /* FIXME: we should use weak memory ops here. This should help especially on x86. */
+               if (nursery_clear_policy == CLEAR_AT_TLAB_CREATION)
+                       memset (p, 0, size);
+       } else {
+               int available_in_tlab;
+               char *real_end;
                /* tlab_next and tlab_temp_end are TLS vars so accessing them might be expensive */
 
                p = (void**)TLAB_NEXT;
                /* FIXME: handle overflow */
                new_next = (char*)p + size;
-               TLAB_NEXT = new_next;
-
-               if (G_LIKELY (new_next < TLAB_TEMP_END)) {
-                       /* Fast path */
-
-                       /* 
-                        * FIXME: We might need a memory barrier here so the change to tlab_next is 
-                        * visible before the vtable store.
-                        */
 
-                       HEAVY_STAT (++stat_objects_alloced);
-                       HEAVY_STAT (stat_bytes_alloced += size);
+               real_end = TLAB_REAL_END;
+               available_in_tlab = real_end - (char*)p;
 
-                       DEBUG (6, fprintf (gc_debug_file, "Allocated object %p, vtable: %p (%s), size: %zd\n", p, vtable, vtable->klass->name, size));
-                       binary_protocol_alloc (p, vtable, size);
-                       g_assert (*p == NULL);
-                       *p = vtable;
+               if (G_LIKELY (new_next < real_end)) {
+                       TLAB_NEXT = new_next;
+               } else if (available_in_tlab > SGEN_MAX_NURSERY_WASTE) {
+                       /* Allocate directly from the nursery */
+                       p = mono_sgen_nursery_alloc (size);
+                       if (!p)
+                               return NULL;
 
-                       g_assert (TLAB_NEXT == new_next);
+                       if (nursery_clear_policy == CLEAR_AT_TLAB_CREATION)
+                               memset (p, 0, size);                    
+               } else {
+                       int alloc_size = 0;
+
+                       mono_sgen_nursery_retire_region (p, available_in_tlab);
+                       new_next = mono_sgen_nursery_alloc_range (tlab_size, size, &alloc_size);
+                       p = (void**)new_next;
+                       if (!p)
+                               return NULL;
+
+                       TLAB_START = (char*)new_next;
+                       TLAB_NEXT = new_next + size;
+                       TLAB_REAL_END = new_next + alloc_size;
+                       TLAB_TEMP_END = new_next + MIN (SCAN_START_SIZE, alloc_size);
+
+                       if (nursery_clear_policy == CLEAR_AT_TLAB_CREATION)
+                               memset (new_next, 0, alloc_size);
+                       new_next += size;
+               }
 
-                       return p;
+               /* Second case, we overflowed temp end */
+               if (G_UNLIKELY (new_next >= TLAB_TEMP_END)) {
+                       nursery_section->scan_starts [((char*)p - (char*)nursery_section->data)/SCAN_START_SIZE] = (char*)p;
+                       /* we just bump tlab_temp_end as well */
+                       TLAB_TEMP_END = MIN (TLAB_REAL_END, TLAB_NEXT + SCAN_START_SIZE);
+                       DEBUG (5, fprintf (gc_debug_file, "Expanding local alloc: %p-%p\n", TLAB_NEXT, TLAB_TEMP_END));         
                }
        }
-       return NULL;
+
+       HEAVY_STAT (++stat_objects_alloced);
+       HEAVY_STAT (stat_bytes_alloced += size);
+
+       DEBUG (6, fprintf (gc_debug_file, "Allocated object %p, vtable: %p (%s), size: %zd\n", p, vtable, vtable->klass->name, size));
+       binary_protocol_alloc (p, vtable, size);
+       g_assert (*p == NULL); /* FIXME disable this in non debug builds */
+
+       mono_atomic_store_seq (p, vtable);
+
+       return p;
 }
 
 void*
@@ -4110,6 +3957,7 @@ mono_gc_alloc_vector (MonoVTable *vtable, size_t size, uintptr_t max_length)
        ENTER_CRITICAL_REGION;
        arr = mono_gc_try_alloc_obj_nolock (vtable, size);
        if (arr) {
+               /*This doesn't require fencing since EXIT_CRITICAL_REGION already does it for us*/
                arr->max_length = max_length;
                EXIT_CRITICAL_REGION;
                return arr;
@@ -4165,6 +4013,7 @@ mono_gc_alloc_string (MonoVTable *vtable, size_t size, gint32 len)
        ENTER_CRITICAL_REGION;
        str = mono_gc_try_alloc_obj_nolock (vtable, size);
        if (str) {
+               /*This doesn't require fencing since EXIT_CRITICAL_REGION already does it for us*/
                str->length = len;
                EXIT_CRITICAL_REGION;
                return str;
@@ -4208,7 +4057,7 @@ mono_gc_alloc_pinned_obj (MonoVTable *vtable, size_t size)
        if (G_LIKELY (p)) {
                DEBUG (6, fprintf (gc_debug_file, "Allocated pinned object %p, vtable: %p (%s), size: %zd\n", p, vtable, vtable->klass->name, size));
                binary_protocol_alloc_pinned (p, vtable, size);
-               *p = vtable;
+               mono_atomic_store_seq (p, vtable);
        }
        UNLOCK_GC;
        return p;
@@ -4220,8 +4069,8 @@ mono_gc_alloc_mature (MonoVTable *vtable)
        void **res;
        size_t size = ALIGN_UP (vtable->klass->instance_size);
        LOCK_GC;
-       res = alloc_degraded (vtable, size);
-       *res = vtable;
+       res = alloc_degraded (vtable, size, TRUE);
+       mono_atomic_store_seq (res, vtable);
        UNLOCK_GC;
        if (G_UNLIKELY (vtable->klass->has_finalize))
                mono_object_register_finalizer ((MonoObject*)res);
@@ -4268,117 +4117,6 @@ queue_finalization_entry (FinalizeEntry *entry) {
        }
 }
 
-/* LOCKING: requires that the GC lock is held */
-static void
-rehash_fin_table (FinalizeEntryHashTable *hash_table)
-{
-       FinalizeEntry **finalizable_hash = hash_table->table;
-       mword finalizable_hash_size = hash_table->size;
-       int i;
-       unsigned int hash;
-       FinalizeEntry **new_hash;
-       FinalizeEntry *entry, *next;
-       int new_size = g_spaced_primes_closest (hash_table->num_registered);
-
-       new_hash = mono_sgen_alloc_internal_dynamic (new_size * sizeof (FinalizeEntry*), INTERNAL_MEM_FIN_TABLE);
-       for (i = 0; i < finalizable_hash_size; ++i) {
-               for (entry = finalizable_hash [i]; entry; entry = next) {
-                       hash = mono_object_hash (entry->object) % new_size;
-                       next = entry->next;
-                       entry->next = new_hash [hash];
-                       new_hash [hash] = entry;
-               }
-       }
-       mono_sgen_free_internal_dynamic (finalizable_hash, finalizable_hash_size * sizeof (FinalizeEntry*), INTERNAL_MEM_FIN_TABLE);
-       hash_table->table = new_hash;
-       hash_table->size = new_size;
-}
-
-/* LOCKING: requires that the GC lock is held */
-static void
-rehash_fin_table_if_necessary (FinalizeEntryHashTable *hash_table)
-{
-       if (hash_table->num_registered >= hash_table->size * 2)
-               rehash_fin_table (hash_table);
-}
-
-/* LOCKING: requires that the GC lock is held */
-static void
-finalize_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, GrayQueue *queue)
-{
-       FinalizeEntryHashTable *hash_table = get_finalize_entry_hash_table (generation);
-       FinalizeEntry *entry, *prev;
-       int i;
-       FinalizeEntry **finalizable_hash = hash_table->table;
-       mword finalizable_hash_size = hash_table->size;
-
-       if (no_finalize)
-               return;
-       for (i = 0; i < finalizable_hash_size; ++i) {
-               prev = NULL;
-               for (entry = finalizable_hash [i]; entry;) {
-                       if ((char*)entry->object >= start && (char*)entry->object < end && !major_collector.is_object_live (entry->object)) {
-                               gboolean is_fin_ready = object_is_fin_ready (entry->object);
-                               char *copy = entry->object;
-                               copy_func ((void**)&copy, queue);
-                               if (is_fin_ready) {
-                                       char *from;
-                                       FinalizeEntry *next;
-                                       /* remove and put in fin_ready_list */
-                                       if (prev)
-                                               prev->next = entry->next;
-                                       else
-                                               finalizable_hash [i] = entry->next;
-                                       next = entry->next;
-                                       num_ready_finalizers++;
-                                       hash_table->num_registered--;
-                                       queue_finalization_entry (entry);
-                                       bridge_register_finalized_object ((MonoObject*)copy);
-                                       /* Make it survive */
-                                       from = entry->object;
-                                       entry->object = copy;
-                                       DEBUG (5, fprintf (gc_debug_file, "Queueing object for finalization: %p (%s) (was at %p) (%d/%d)\n", entry->object, safe_name (entry->object), from, num_ready_finalizers, hash_table->num_registered));
-                                       entry = next;
-                                       continue;
-                               } else {
-                                       char *from = entry->object;
-                                       if (hash_table == &minor_finalizable_hash && !ptr_in_nursery (copy)) {
-                                               FinalizeEntry *next = entry->next;
-                                               unsigned int major_hash;
-                                               /* remove from the list */
-                                               if (prev)
-                                                       prev->next = entry->next;
-                                               else
-                                                       finalizable_hash [i] = entry->next;
-                                               hash_table->num_registered--;
-
-                                               entry->object = copy;
-
-                                               /* insert it into the major hash */
-                                               rehash_fin_table_if_necessary (&major_finalizable_hash);
-                                               major_hash = mono_object_hash ((MonoObject*) copy) %
-                                                       major_finalizable_hash.size;
-                                               entry->next = major_finalizable_hash.table [major_hash];
-                                               major_finalizable_hash.table [major_hash] = entry;
-                                               major_finalizable_hash.num_registered++;
-
-                                               DEBUG (5, fprintf (gc_debug_file, "Promoting finalization of object %p (%s) (was at %p) to major table\n", copy, safe_name (copy), from));
-
-                                               entry = next;
-                                               continue;
-                                       } else {
-                                               /* update pointer */
-                                               DEBUG (5, fprintf (gc_debug_file, "Updating object for finalization: %p (%s) (was at %p)\n", entry->object, safe_name (entry->object), from));
-                                               entry->object = copy;
-                                       }
-                               }
-                       }
-                       prev = entry;
-                       entry = entry->next;
-               }
-       }
-}
-
 static int
 object_is_reachable (char *object, char *start, char *end)
 {
@@ -4388,6 +4126,8 @@ object_is_reachable (char *object, char *start, char *end)
        return !object_is_fin_ready (object) || major_collector.is_object_live (object);
 }
 
+#include "sgen-fin-weak-hash.c"
+
 gboolean
 mono_sgen_object_is_live (void *obj)
 {
@@ -4486,11 +4226,11 @@ clear_unreachable_ephemerons (CopyOrMarkObjectFunc copy_func, char *start, char
                        if (was_promoted) {
                                if (ptr_in_nursery (key)) {/*key was not promoted*/
                                        DEBUG (5, fprintf (gc_debug_file, "\tAdded remset to key %p\n", key));
-                                       mono_sgen_add_to_global_remset (queue->allocator, &cur->key);
+                                       mono_sgen_add_to_global_remset (&cur->key);
                                }
                                if (ptr_in_nursery (cur->value)) {/*value was not promoted*/
                                        DEBUG (5, fprintf (gc_debug_file, "\tAdded remset to value %p\n", cur->value));
-                                       mono_sgen_add_to_global_remset (queue->allocator, &cur->value);
+                                       mono_sgen_add_to_global_remset (&cur->value);
                                }
                        }
                }
@@ -4557,352 +4297,6 @@ mark_ephemerons_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end
        return nothing_marked;
 }
 
-/* LOCKING: requires that the GC lock is held */
-static void
-null_link_in_range (CopyOrMarkObjectFunc copy_func, char *start, char *end, int generation, gboolean before_finalization, GrayQueue *queue)
-{
-       DisappearingLinkHashTable *hash = get_dislink_hash_table (generation);
-       DisappearingLink **disappearing_link_hash = hash->table;
-       int disappearing_link_hash_size = hash->size;
-       DisappearingLink *entry, *prev;
-       int i;
-       if (!hash->num_links)
-               return;
-       for (i = 0; i < disappearing_link_hash_size; ++i) {
-               prev = NULL;
-               for (entry = disappearing_link_hash [i]; entry;) {
-                       char *object;
-                       gboolean track = DISLINK_TRACK (entry);
-
-                       /*
-                        * Tracked references are processed after
-                        * finalization handling whereas standard weak
-                        * references are processed before.  If an
-                        * object is still not marked after finalization
-                        * handling it means that it either doesn't have
-                        * a finalizer or the finalizer has already run,
-                        * so we must null a tracking reference.
-                        */
-                       if (track == before_finalization) {
-                               prev = entry;
-                               entry = entry->next;
-                               continue;
-                       }
-
-                       object = DISLINK_OBJECT (entry);
-
-                       if (object >= start && object < end && !major_collector.is_object_live (object)) {
-                               if (object_is_fin_ready (object)) {
-                                       void **p = entry->link;
-                                       DisappearingLink *old;
-                                       *p = NULL;
-                                       /* remove from list */
-                                       if (prev)
-                                               prev->next = entry->next;
-                                       else
-                                               disappearing_link_hash [i] = entry->next;
-                                       DEBUG (5, fprintf (gc_debug_file, "Dislink nullified at %p to GCed object %p\n", p, object));
-                                       old = entry->next;
-                                       mono_sgen_free_internal (entry, INTERNAL_MEM_DISLINK);
-                                       entry = old;
-                                       hash->num_links--;
-                                       continue;
-                               } else {
-                                       char *copy = object;
-                                       copy_func ((void**)&copy, queue);
-
-                                       /* Update pointer if it's moved.  If the object
-                                        * has been moved out of the nursery, we need to
-                                        * remove the link from the minor hash table to
-                                        * the major one.
-                                        *
-                                        * FIXME: what if an object is moved earlier?
-                                        */
-
-                                       if (hash == &minor_disappearing_link_hash && !ptr_in_nursery (copy)) {
-                                               void **link = entry->link;
-                                               DisappearingLink *old;
-                                               /* remove from list */
-                                               if (prev)
-                                                       prev->next = entry->next;
-                                               else
-                                                       disappearing_link_hash [i] = entry->next;
-                                               old = entry->next;
-                                               mono_sgen_free_internal (entry, INTERNAL_MEM_DISLINK);
-                                               entry = old;
-                                               hash->num_links--;
-
-                                               add_or_remove_disappearing_link ((MonoObject*)copy, link,
-                                                       track, GENERATION_OLD);
-
-                                               DEBUG (5, fprintf (gc_debug_file, "Upgraded dislink at %p to major because object %p moved to %p\n", link, object, copy));
-
-                                               continue;
-                                       } else {
-                                               *entry->link = HIDE_POINTER (copy, track);
-                                               DEBUG (5, fprintf (gc_debug_file, "Updated dislink at %p to %p\n", entry->link, DISLINK_OBJECT (entry)));
-                                       }
-                               }
-                       }
-                       prev = entry;
-                       entry = entry->next;
-               }
-       }
-}
-
-/* LOCKING: requires that the GC lock is held */
-static void
-null_links_for_domain (MonoDomain *domain, int generation)
-{
-       DisappearingLinkHashTable *hash = get_dislink_hash_table (generation);
-       DisappearingLink **disappearing_link_hash = hash->table;
-       int disappearing_link_hash_size = hash->size;
-       DisappearingLink *entry, *prev;
-       int i;
-       for (i = 0; i < disappearing_link_hash_size; ++i) {
-               prev = NULL;
-               for (entry = disappearing_link_hash [i]; entry; ) {
-                       char *object = DISLINK_OBJECT (entry);
-                       if (object && !((MonoObject*)object)->vtable) {
-                               DisappearingLink *next = entry->next;
-
-                               if (prev)
-                                       prev->next = next;
-                               else
-                                       disappearing_link_hash [i] = next;
-
-                               if (*(entry->link)) {
-                                       *(entry->link) = NULL;
-                                       g_warning ("Disappearing link %p not freed", entry->link);
-                               } else {
-                                       mono_sgen_free_internal (entry, INTERNAL_MEM_DISLINK);
-                               }
-
-                               entry = next;
-                               continue;
-                       }
-                       prev = entry;
-                       entry = entry->next;
-               }
-       }
-}
-
-/* LOCKING: requires that the GC lock is held */
-static int
-finalizers_for_domain (MonoDomain *domain, MonoObject **out_array, int out_size,
-       FinalizeEntryHashTable *hash_table)
-{
-       FinalizeEntry **finalizable_hash = hash_table->table;
-       mword finalizable_hash_size = hash_table->size;
-       FinalizeEntry *entry, *prev;
-       int i, count;
-
-       if (no_finalize || !out_size || !out_array)
-               return 0;
-       count = 0;
-       for (i = 0; i < finalizable_hash_size; ++i) {
-               prev = NULL;
-               for (entry = finalizable_hash [i]; entry;) {
-                       if (mono_object_domain (entry->object) == domain) {
-                               FinalizeEntry *next;
-                               /* remove and put in out_array */
-                               if (prev)
-                                       prev->next = entry->next;
-                               else
-                                       finalizable_hash [i] = entry->next;
-                               next = entry->next;
-                               hash_table->num_registered--;
-                               out_array [count ++] = entry->object;
-                               DEBUG (5, fprintf (gc_debug_file, "Collecting object for finalization: %p (%s) (%d/%d)\n", entry->object, safe_name (entry->object), num_ready_finalizers, hash_table->num_registered));
-                               entry = next;
-                               if (count == out_size)
-                                       return count;
-                               continue;
-                       }
-                       prev = entry;
-                       entry = entry->next;
-               }
-       }
-       return count;
-}
-
-/**
- * mono_gc_finalizers_for_domain:
- * @domain: the unloading appdomain
- * @out_array: output array
- * @out_size: size of output array
- *
- * Store inside @out_array up to @out_size objects that belong to the unloading
- * appdomain @domain. Returns the number of stored items. Can be called repeteadly
- * until it returns 0.
- * The items are removed from the finalizer data structure, so the caller is supposed
- * to finalize them.
- * @out_array should be on the stack to allow the GC to know the objects are still alive.
- */
-int
-mono_gc_finalizers_for_domain (MonoDomain *domain, MonoObject **out_array, int out_size)
-{
-       int result;
-
-       LOCK_GC;
-       result = finalizers_for_domain (domain, out_array, out_size, &minor_finalizable_hash);
-       if (result < out_size) {
-               result += finalizers_for_domain (domain, out_array + result, out_size - result,
-                       &major_finalizable_hash);
-       }
-       UNLOCK_GC;
-
-       return result;
-}
-
-static void
-register_for_finalization (MonoObject *obj, void *user_data, int generation)
-{
-       FinalizeEntryHashTable *hash_table = get_finalize_entry_hash_table (generation);
-       FinalizeEntry **finalizable_hash;
-       mword finalizable_hash_size;
-       FinalizeEntry *entry, *prev;
-       unsigned int hash;
-       if (no_finalize)
-               return;
-       g_assert (user_data == NULL || user_data == mono_gc_run_finalize);
-       hash = mono_object_hash (obj);
-       LOCK_GC;
-       rehash_fin_table_if_necessary (hash_table);
-       finalizable_hash = hash_table->table;
-       finalizable_hash_size = hash_table->size;
-       hash %= finalizable_hash_size;
-       prev = NULL;
-       for (entry = finalizable_hash [hash]; entry; entry = entry->next) {
-               if (entry->object == obj) {
-                       if (!user_data) {
-                               /* remove from the list */
-                               if (prev)
-                                       prev->next = entry->next;
-                               else
-                                       finalizable_hash [hash] = entry->next;
-                               hash_table->num_registered--;
-                               DEBUG (5, fprintf (gc_debug_file, "Removed finalizer %p for object: %p (%s) (%d)\n", entry, obj, obj->vtable->klass->name, hash_table->num_registered));
-                               mono_sgen_free_internal (entry, INTERNAL_MEM_FINALIZE_ENTRY);
-                       }
-                       UNLOCK_GC;
-                       return;
-               }
-               prev = entry;
-       }
-       if (!user_data) {
-               /* request to deregister, but already out of the list */
-               UNLOCK_GC;
-               return;
-       }
-       entry = mono_sgen_alloc_internal (INTERNAL_MEM_FINALIZE_ENTRY);
-       entry->object = obj;
-       entry->next = finalizable_hash [hash];
-       finalizable_hash [hash] = entry;
-       hash_table->num_registered++;
-       DEBUG (5, fprintf (gc_debug_file, "Added finalizer %p for object: %p (%s) (%d) to %s table\n", entry, obj, obj->vtable->klass->name, hash_table->num_registered, generation_name (generation)));
-       UNLOCK_GC;
-}
-
-void
-mono_gc_register_for_finalization (MonoObject *obj, void *user_data)
-{
-       if (ptr_in_nursery (obj))
-               register_for_finalization (obj, user_data, GENERATION_NURSERY);
-       else
-               register_for_finalization (obj, user_data, GENERATION_OLD);
-}
-
-static void
-rehash_dislink (DisappearingLinkHashTable *hash_table)
-{
-       DisappearingLink **disappearing_link_hash = hash_table->table;
-       int disappearing_link_hash_size = hash_table->size;
-       int i;
-       unsigned int hash;
-       DisappearingLink **new_hash;
-       DisappearingLink *entry, *next;
-       int new_size = g_spaced_primes_closest (hash_table->num_links);
-
-       new_hash = mono_sgen_alloc_internal_dynamic (new_size * sizeof (DisappearingLink*), INTERNAL_MEM_DISLINK_TABLE);
-       for (i = 0; i < disappearing_link_hash_size; ++i) {
-               for (entry = disappearing_link_hash [i]; entry; entry = next) {
-                       hash = mono_aligned_addr_hash (entry->link) % new_size;
-                       next = entry->next;
-                       entry->next = new_hash [hash];
-                       new_hash [hash] = entry;
-               }
-       }
-       mono_sgen_free_internal_dynamic (disappearing_link_hash,
-                       disappearing_link_hash_size * sizeof (DisappearingLink*), INTERNAL_MEM_DISLINK_TABLE);
-       hash_table->table = new_hash;
-       hash_table->size = new_size;
-}
-
-/* LOCKING: assumes the GC lock is held */
-static void
-add_or_remove_disappearing_link (MonoObject *obj, void **link, gboolean track, int generation)
-{
-       DisappearingLinkHashTable *hash_table = get_dislink_hash_table (generation);
-       DisappearingLink *entry, *prev;
-       unsigned int hash;
-       DisappearingLink **disappearing_link_hash = hash_table->table;
-       int disappearing_link_hash_size = hash_table->size;
-
-       if (hash_table->num_links >= disappearing_link_hash_size * 2) {
-               rehash_dislink (hash_table);
-               disappearing_link_hash = hash_table->table;
-               disappearing_link_hash_size = hash_table->size;
-       }
-       /* FIXME: add check that link is not in the heap */
-       hash = mono_aligned_addr_hash (link) % disappearing_link_hash_size;
-       entry = disappearing_link_hash [hash];
-       prev = NULL;
-       for (; entry; entry = entry->next) {
-               /* link already added */
-               if (link == entry->link) {
-                       /* NULL obj means remove */
-                       if (obj == NULL) {
-                               if (prev)
-                                       prev->next = entry->next;
-                               else
-                                       disappearing_link_hash [hash] = entry->next;
-                               hash_table->num_links--;
-                               DEBUG (5, fprintf (gc_debug_file, "Removed dislink %p (%d) from %s table\n", entry, hash_table->num_links, generation_name (generation)));
-                               mono_sgen_free_internal (entry, INTERNAL_MEM_DISLINK);
-                               *link = NULL;
-                       } else {
-                               *link = HIDE_POINTER (obj, track); /* we allow the change of object */
-                       }
-                       return;
-               }
-               prev = entry;
-       }
-       if (obj == NULL)
-               return;
-       entry = mono_sgen_alloc_internal (INTERNAL_MEM_DISLINK);
-       *link = HIDE_POINTER (obj, track);
-       entry->link = link;
-       entry->next = disappearing_link_hash [hash];
-       disappearing_link_hash [hash] = entry;
-       hash_table->num_links++;
-       DEBUG (5, fprintf (gc_debug_file, "Added dislink %p for object: %p (%s) at %p to %s table\n", entry, obj, obj->vtable->klass->name, link, generation_name (generation)));
-}
-
-/* LOCKING: assumes the GC lock is held */
-static void
-mono_gc_register_disappearing_link (MonoObject *obj, void **link, gboolean track)
-{
-       add_or_remove_disappearing_link (NULL, link, FALSE, GENERATION_NURSERY);
-       add_or_remove_disappearing_link (NULL, link, FALSE, GENERATION_OLD);
-       if (obj) {
-               if (ptr_in_nursery (obj))
-                       add_or_remove_disappearing_link (obj, link, track, GENERATION_NURSERY);
-               else
-                       add_or_remove_disappearing_link (obj, link, track, GENERATION_OLD);
-       }
-}
-
 int
 mono_gc_invoke_finalizers (void)
 {
@@ -5224,8 +4618,8 @@ restart_threads_until_none_in_managed_allocator (void)
                        gboolean result;
                        if (info->skip)
                                continue;
-                       if (!info->stack_start || info->in_critical_region ||
-                                       is_ip_in_managed_allocator (info->stopped_domain, info->stopped_ip)) {
+                       if (!info->thread_is_dying && (!info->stack_start || info->in_critical_region ||
+                                       is_ip_in_managed_allocator (info->stopped_domain, info->stopped_ip))) {
                                binary_protocol_thread_restart ((gpointer)mono_thread_info_get_tid (info));
                                result = mono_sgen_resume_thread (info);
                                if (result) {
@@ -5281,13 +4675,10 @@ restart_threads_until_none_in_managed_allocator (void)
        return num_threads_died;
 }
 
-/* LOCKING: assumes the GC lock is held (by the stopping thread) */
 static void
-suspend_handler (int sig, siginfo_t *siginfo, void *context)
+suspend_thread (SgenThreadInfo *info, void *context)
 {
-       SgenThreadInfo *info;
        int stop_count;
-       int old_errno = errno;
 #ifdef USE_MONO_CTX
        MonoContext monoctx;
 #else
@@ -5295,35 +4686,38 @@ suspend_handler (int sig, siginfo_t *siginfo, void *context)
 #endif
        gpointer stack_start;
 
-       info = mono_thread_info_current ();
-       if (!info)
-               /* This can happen while a thread is dying */
-               return;
+       g_assert (info->doing_handshake);
 
        info->stopped_domain = mono_domain_get ();
-       info->stopped_ip = (gpointer) ARCH_SIGCTX_IP (context);
+       info->stopped_ip = context ? (gpointer) ARCH_SIGCTX_IP (context) : NULL;
        stop_count = global_stop_count;
        /* duplicate signal */
-       if (0 && info->stop_count == stop_count) {
-               errno = old_errno;
+       if (0 && info->stop_count == stop_count)
                return;
-       }
 #ifdef HAVE_KW_THREAD
        /* update the remset info in the thread data structure */
        info->remset = remembered_set;
 #endif
-       stack_start = (char*) ARCH_SIGCTX_SP (context) - REDZONE_SIZE;
+       stack_start = context ? (char*) ARCH_SIGCTX_SP (context) - REDZONE_SIZE : NULL;
        /* If stack_start is not within the limits, then don't set it
           in info and we will be restarted. */
        if (stack_start >= info->stack_start_limit && info->stack_start <= info->stack_end) {
                info->stack_start = stack_start;
 
 #ifdef USE_MONO_CTX
-               mono_sigctx_to_monoctx (context, &monoctx);
-               info->monoctx = &monoctx;
+               if (context) {
+                       mono_sigctx_to_monoctx (context, &monoctx);
+                       info->monoctx = &monoctx;
+               } else {
+                       info->monoctx = NULL;
+               }
 #else
-               ARCH_COPY_SIGCTX_REGS (regs, context);
-               info->stopped_regs = regs;
+               if (context) {
+                       ARCH_COPY_SIGCTX_REGS (regs, context);
+                       info->stopped_regs = regs;
+               } else {
+                       info->stopped_regs = NULL;
+               }
 #endif
        } else {
                g_assert (!info->stack_start);
@@ -5342,11 +4736,28 @@ suspend_handler (int sig, siginfo_t *siginfo, void *context)
        do {
                info->signal = 0;
                sigsuspend (&suspend_signal_mask);
-       } while (info->signal != restart_signal_num);
+       } while (info->signal != restart_signal_num && info->doing_handshake);
 
        DEBUG (4, fprintf (gc_debug_file, "Posting suspend_ack_semaphore for resume from %p %p\n", info, (gpointer)mono_native_thread_id_get ()));
        /* notify the waiting thread */
        MONO_SEM_POST (suspend_ack_semaphore_ptr);
+}
+
+/* LOCKING: assumes the GC lock is held (by the stopping thread) */
+static void
+suspend_handler (int sig, siginfo_t *siginfo, void *context)
+{
+       SgenThreadInfo *info;
+       int old_errno = errno;
+
+       info = mono_thread_info_current ();
+
+       if (info) {
+               suspend_thread (info, context);
+       } else {
+               /* This can happen while a thread is dying */
+               g_print ("no thread info in suspend\n");
+       }
 
        errno = old_errno;
 }
@@ -5358,8 +4769,15 @@ restart_handler (int sig)
        int old_errno = errno;
 
        info = mono_thread_info_current ();
-       info->signal = restart_signal_num;
-       DEBUG (4, fprintf (gc_debug_file, "Restart handler in %p %p\n", info, (gpointer)mono_native_thread_id_get ()));
+
+       /*
+        * If a thread is dying there might be no thread info.  In
+        * that case we rely on info->doing_handshake.
+        */
+       if (info) {
+               info->signal = restart_signal_num;
+               DEBUG (4, fprintf (gc_debug_file, "Restart handler in %p %p\n", info, (gpointer)mono_native_thread_id_get ()));
+       }
 
        errno = old_errno;
 }
@@ -5368,11 +4786,13 @@ static void
 acquire_gc_locks (void)
 {
        LOCK_INTERRUPTION;
+       mono_thread_info_suspend_lock ();
 }
 
 static void
 release_gc_locks (void)
 {
+       mono_thread_info_suspend_unlock ();
        UNLOCK_INTERRUPTION;
 }
 
@@ -5435,6 +4855,9 @@ restart_world (int generation)
        max_pause_usec = MAX (usec, max_pause_usec);
        DEBUG (2, fprintf (gc_debug_file, "restarted %d thread(s) (pause time: %d usec, max: %d)\n", count, (int)usec, (int)max_pause_usec));
        mono_profiler_gc_event (MONO_GC_EVENT_POST_START_WORLD, generation);
+
+       bridge_process ();
+
        return count;
 }
 
@@ -5472,10 +4895,14 @@ mono_gc_scan_object (void *obj)
 {
        UserCopyOrMarkData *data = pthread_getspecific (user_copy_or_mark_key);
 
-       if (current_collection_generation == GENERATION_NURSERY)
-               major_collector.copy_object (&obj, data->queue);
-       else
+       if (current_collection_generation == GENERATION_NURSERY) {
+               if (collection_is_parallel ())
+                       major_collector.copy_object (&obj, data->queue);
+               else
+                       major_collector.nopar_copy_object (&obj, data->queue);
+       } else {
                major_collector.copy_or_mark_object (&obj, data->queue);
+       }
        return obj;
 }
 
@@ -5502,15 +4929,16 @@ scan_thread_data (void *start_nursery, void *end_nursery, gboolean precise, Gray
                        gc_callbacks.thread_mark_func (info->runtime_data, info->stack_start, info->stack_end, precise);
                        set_user_copy_or_mark_data (NULL);
                } else if (!precise) {
-                       conservatively_pin_objects_from (info->stack_start, info->stack_end, start_nursery, end_nursery, PIN_TYPE_STACK);
+                       if (!info->thread_is_dying)
+                               conservatively_pin_objects_from (info->stack_start, info->stack_end, start_nursery, end_nursery, PIN_TYPE_STACK);
                }
 
 #ifdef USE_MONO_CTX
-               if (!precise)
+               if (!info->thread_is_dying && !precise)
                        conservatively_pin_objects_from ((void**)info->monoctx, (void**)info->monoctx + ARCH_NUM_REGS,
                                start_nursery, end_nursery, PIN_TYPE_STACK);
 #else
-               if (!precise)
+               if (!info->thread_is_dying && !precise)
                        conservatively_pin_objects_from (info->stopped_regs, info->stopped_regs + ARCH_NUM_REGS,
                                        start_nursery, end_nursery, PIN_TYPE_STACK);
 #endif
@@ -5588,7 +5016,7 @@ handle_remset (mword *p, void *start_nursery, void *end_nursery, gboolean global
                                 * becomes part of the global remset, which can grow very large.
                                 */
                                DEBUG (9, fprintf (gc_debug_file, "Add to global remset because of pinning %p (%p %s)\n", ptr, *ptr, safe_name (*ptr)));
-                               mono_sgen_add_to_global_remset (queue->allocator, ptr);
+                               mono_sgen_add_to_global_remset (ptr);
                        }
                } else {
                        DEBUG (9, fprintf (gc_debug_file, "Skipping remset at %p holding %p\n", ptr, *ptr));
@@ -5603,7 +5031,7 @@ handle_remset (mword *p, void *start_nursery, void *end_nursery, gboolean global
                        major_collector.copy_object (ptr, queue);
                        DEBUG (9, fprintf (gc_debug_file, "Overwrote remset at %p with %p (count: %d)\n", ptr, *ptr, (int)count));
                        if (!global && *ptr >= start_nursery && *ptr < end_nursery)
-                               mono_sgen_add_to_global_remset (queue->allocator, ptr);
+                               mono_sgen_add_to_global_remset (ptr);
                        ++ptr;
                }
                return p + 2;
@@ -5611,17 +5039,23 @@ handle_remset (mword *p, void *start_nursery, void *end_nursery, gboolean global
                ptr = (void**)(*p & ~REMSET_TYPE_MASK);
                if (((void*)ptr >= start_nursery && (void*)ptr < end_nursery))
                        return p + 1;
-               major_collector.minor_scan_object ((char*)ptr, queue);
+               mono_sgen_get_minor_scan_object () ((char*)ptr, queue);
                return p + 1;
        case REMSET_VTYPE: {
+               ScanVTypeFunc scan_vtype = mono_sgen_get_minor_scan_vtype ();
+               size_t skip_size;
+
                ptr = (void**)(*p & ~REMSET_TYPE_MASK);
                if (((void*)ptr >= start_nursery && (void*)ptr < end_nursery))
-                       return p + 3;
+                       return p + 4;
                desc = p [1];
                count = p [2];
-               while (count-- > 0)
-                       ptr = (void**) major_collector.minor_scan_vtype ((char*)ptr, desc, start_nursery, end_nursery, queue);
-               return p + 3;
+               skip_size = p [3];
+               while (count-- > 0) {
+                       scan_vtype ((char*)ptr, desc, queue);
+                       ptr = (void**)((char*)ptr + skip_size);
+               }
+               return p + 4;
        }
        default:
                g_assert_not_reached ();
@@ -5660,7 +5094,7 @@ collect_store_remsets (RememberedSet *remset, mword *bumper)
                        p += 1;
                        break;
                case REMSET_VTYPE:
-                       p += 3;
+                       p += 4;
                        break;
                default:
                        g_assert_not_reached ();
@@ -5722,7 +5156,9 @@ static void
 clear_thread_store_remset_buffer (SgenThreadInfo *info)
 {
        *info->store_remset_buffer_index_addr = 0;
-       memset (*info->store_remset_buffer_addr, 0, sizeof (gpointer) * STORE_REMSET_BUFFER_SIZE);
+       /* See the comment at the end of sgen_thread_unregister() */
+       if (*info->store_remset_buffer_addr)
+               memset (*info->store_remset_buffer_addr, 0, sizeof (gpointer) * STORE_REMSET_BUFFER_SIZE);
 }
 
 static size_t
@@ -5853,8 +5289,7 @@ clear_remsets (void)
                remset->next = NULL;
                if (remset != global_remset) {
                        DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
-                       mono_sgen_free_internal_dynamic_delayed (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET,
-                                       mono_sgen_get_unmanaged_allocator ());
+                       mono_sgen_free_internal_dynamic (remset, remset_byte_size (remset), INTERNAL_MEM_REMSET);
                }
        }
        /* the generic store ones */
@@ -5923,6 +5358,8 @@ sgen_thread_register (SgenThreadInfo* info, void *addr)
        info->stop_count = -1;
        info->skip = 0;
        info->signal = 0;
+       info->doing_handshake = FALSE;
+       info->thread_is_dying = FALSE;
        info->stack_start = NULL;
        info->tlab_start_addr = &TLAB_START;
        info->tlab_next_addr = &TLAB_NEXT;
@@ -5978,7 +5415,7 @@ sgen_thread_register (SgenThreadInfo* info, void *addr)
        stack_end = info->stack_end;
 #endif
 
-       info->remset = alloc_remset (DEFAULT_REMSET_SIZE, info);
+       info->remset = alloc_remset (DEFAULT_REMSET_SIZE, info, FALSE);
        pthread_setspecific (remembered_set_key, info->remset);
 #ifdef HAVE_KW_THREAD
        remembered_set = info->remset;
@@ -6019,7 +5456,38 @@ sgen_thread_unregister (SgenThreadInfo *p)
        if (mono_domain_get ())
                mono_thread_detach (mono_thread_current ());
 
+       p->thread_is_dying = TRUE;
+
+       /*
+       There is a race condition between a thread finishing execution and being removed
+       from the GC thread set.
+       This happens on POSIX systems when TLS data is being cleaned up: libpthread will
+       set the thread_info slot to NULL before calling the cleanup function. This
+       opens a window in which the thread is registered but has a NULL TLS.
+
+       The suspend signal handler needs TLS data to know where to store thread state
+       data or otherwise it will simply ignore the thread.
+
+       This solution works because the thread doing STW will wait until all threads being
+       suspended handshake back, so there is no race between the doing_handshake test
+       and the suspend_thread call.
+
+       This is not required on systems that do synchronous STW as those can deal with
+       the above race at suspend time.
+
+       FIXME: I believe we could avoid this by using mono_thread_info_lookup when
+       mono_thread_info_current returns NULL. Or fix mono_thread_info_lookup to do so.
+       */
+#if defined(__MACH__) && MONO_MACH_ARCH_SUPPORTED
        LOCK_GC;
+#else
+       while (!TRYLOCK_GC) {
+               if (p->doing_handshake)
+                       suspend_thread (p, NULL);
+               else
+                       usleep (50);
+       }
+#endif
 
        binary_protocol_thread_unregister ((gpointer)id);
        DEBUG (3, fprintf (gc_debug_file, "unregister thread %p (%p)\n", p, (gpointer)mono_thread_info_get_tid (p)));
@@ -6046,6 +5514,18 @@ sgen_thread_unregister (SgenThreadInfo *p)
        if (*p->store_remset_buffer_index_addr)
                add_generic_store_remset_from_buffer (*p->store_remset_buffer_addr);
        mono_sgen_free_internal (*p->store_remset_buffer_addr, INTERNAL_MEM_STORE_REMSET);
+       /*
+        * This is currently not strictly required, but we do it
+        * anyway in case we change thread unregistering:
+
+        * If the thread is removed from the thread list after
+        * unregistering (this is currently not the case), and a
+        * collection occurs, clear_remsets() would want to memset
+        * this buffer, which would either clobber memory or crash.
+        */
+       *p->store_remset_buffer_addr = NULL;
+
+       mono_threads_unregister_current_thread (p);
        UNLOCK_GC;
 }
 
@@ -6099,7 +5579,7 @@ mono_gc_set_stack_end (void *stack_end)
 int
 mono_gc_pthread_create (pthread_t *new_thread, const pthread_attr_t *attr, void *(*start_routine)(void *), void *arg)
 {
-       return mono_threads_pthread_create (new_thread, attr, start_routine, arg);
+       return pthread_create (new_thread, attr, start_routine, arg);
 }
 
 int
@@ -6132,23 +5612,13 @@ dummy_use (gpointer v) {
 
 
 static RememberedSet*
-alloc_remset (int size, gpointer id) {
-       RememberedSet* res = mono_sgen_alloc_internal_dynamic (sizeof (RememberedSet) + (size * sizeof (gpointer)), INTERNAL_MEM_REMSET);
-       res->store_next = res->data;
-       res->end_set = res->data + size;
-       res->next = NULL;
-       DEBUG (4, fprintf (gc_debug_file, "Allocated remset size %d at %p for %p\n", size, res->data, id));
-       return res;
-}
-
-static RememberedSet*
-alloc_global_remset (SgenInternalAllocator *alc, int size, gpointer id)
+alloc_remset (int size, gpointer id, gboolean global)
 {
-       RememberedSet* res = mono_sgen_alloc_internal_full (alc, sizeof (RememberedSet) + (size * sizeof (gpointer)), INTERNAL_MEM_REMSET);
+       RememberedSet* res = mono_sgen_alloc_internal_dynamic (sizeof (RememberedSet) + (size * sizeof (gpointer)), INTERNAL_MEM_REMSET);
        res->store_next = res->data;
        res->end_set = res->data + size;
        res->next = NULL;
-       DEBUG (4, fprintf (gc_debug_file, "Allocated global remset size %d at %p for %p\n", size, res->data, id));
+       DEBUG (4, fprintf (gc_debug_file, "Allocated%s remset size %d at %p for %p\n", global ? " global" : "", size, res->data, id));
        return res;
 }
 
@@ -6184,7 +5654,7 @@ mono_gc_wbarrier_set_field (MonoObject *obj, gpointer field_ptr, MonoObject* val
                        UNLOCK_GC;
                        return;
                }
-               rs = alloc_remset (rs->end_set - rs->data, (void*)1);
+               rs = alloc_remset (rs->end_set - rs->data, (void*)1, FALSE);
                rs->next = REMEMBERED_SET;
                REMEMBERED_SET = rs;
 #ifdef HAVE_KW_THREAD
@@ -6222,7 +5692,7 @@ mono_gc_wbarrier_set_arrayref (MonoArray *arr, gpointer slot_ptr, MonoObject* va
                        UNLOCK_GC;
                        return;
                }
-               rs = alloc_remset (rs->end_set - rs->data, (void*)1);
+               rs = alloc_remset (rs->end_set - rs->data, (void*)1, FALSE);
                rs->next = REMEMBERED_SET;
                REMEMBERED_SET = rs;
 #ifdef HAVE_KW_THREAD
@@ -6240,7 +5710,7 @@ mono_gc_wbarrier_arrayref_copy (gpointer dest_ptr, gpointer src_ptr, int count)
        HEAVY_STAT (++stat_wbarrier_arrayref_copy);
        /*This check can be done without taking a lock since dest_ptr array is pinned*/
        if (ptr_in_nursery (dest_ptr) || count <= 0) {
-               memmove (dest_ptr, src_ptr, count * sizeof (gpointer));
+               mono_gc_memmove (dest_ptr, src_ptr, count * sizeof (gpointer));
                return;
        }
 
@@ -6275,7 +5745,7 @@ mono_gc_wbarrier_arrayref_copy (gpointer dest_ptr, gpointer src_ptr, int count)
                RememberedSet *rs;
                TLAB_ACCESS_INIT;
                LOCK_GC;
-               memmove (dest_ptr, src_ptr, count * sizeof (gpointer));
+               mono_gc_memmove (dest_ptr, src_ptr, count * sizeof (gpointer));
 
                rs = REMEMBERED_SET;
                DEBUG (8, fprintf (gc_debug_file, "Adding remset at %p, %d\n", dest_ptr, count));
@@ -6285,7 +5755,7 @@ mono_gc_wbarrier_arrayref_copy (gpointer dest_ptr, gpointer src_ptr, int count)
                        UNLOCK_GC;
                        return;
                }
-               rs = alloc_remset (rs->end_set - rs->data, (void*)1);
+               rs = alloc_remset (rs->end_set - rs->data, (void*)1, FALSE);
                rs->next = REMEMBERED_SET;
                REMEMBERED_SET = rs;
 #ifdef HAVE_KW_THREAD
@@ -6455,11 +5925,22 @@ mono_gc_wbarrier_value_copy (gpointer dest, gpointer src, int count, MonoClass *
        TLAB_ACCESS_INIT;
        HEAVY_STAT (++stat_wbarrier_value_copy);
        g_assert (klass->valuetype);
-       LOCK_GC;
-       memmove (dest, src, size);
        if (use_cardtable) {
+#ifdef DISABLE_CRITICAL_REGION
+               LOCK_GC;
+#else
+               ENTER_CRITICAL_REGION;
+#endif
+               mono_gc_memmove (dest, src, size);
                sgen_card_table_mark_range ((mword)dest, size);
+#ifdef DISABLE_CRITICAL_REGION
+               UNLOCK_GC;
+#else
+               EXIT_CRITICAL_REGION;
+#endif
        } else {
+               LOCK_GC;
+               mono_gc_memmove (dest, src, size);
                rs = REMEMBERED_SET;
                if (ptr_in_nursery (dest) || ptr_on_stack (dest) || !SGEN_CLASS_HAS_REFERENCES (klass)) {
                        UNLOCK_GC;
@@ -6468,14 +5949,15 @@ mono_gc_wbarrier_value_copy (gpointer dest, gpointer src, int count, MonoClass *
                g_assert (klass->gc_descr_inited);
                DEBUG (8, fprintf (gc_debug_file, "Adding value remset at %p, count %d, descr %p for class %s (%p)\n", dest, count, klass->gc_descr, klass->name, klass));
 
-               if (rs->store_next + 3 < rs->end_set) {
+               if (rs->store_next + 4 < rs->end_set) {
                        *(rs->store_next++) = (mword)dest | REMSET_VTYPE;
                        *(rs->store_next++) = (mword)klass->gc_descr;
                        *(rs->store_next++) = (mword)count;
+                       *(rs->store_next++) = (mword)size;
                        UNLOCK_GC;
                        return;
                }
-               rs = alloc_remset (rs->end_set - rs->data, (void*)1);
+               rs = alloc_remset (rs->end_set - rs->data, (void*)1, FALSE);
                rs->next = REMEMBERED_SET;
                REMEMBERED_SET = rs;
 #ifdef HAVE_KW_THREAD
@@ -6484,8 +5966,9 @@ mono_gc_wbarrier_value_copy (gpointer dest, gpointer src, int count, MonoClass *
                *(rs->store_next++) = (mword)dest | REMSET_VTYPE;
                *(rs->store_next++) = (mword)klass->gc_descr;
                *(rs->store_next++) = (mword)count;
+               *(rs->store_next++) = (mword)size;
+               UNLOCK_GC;
        }
-       UNLOCK_GC;
 }
 
 /**
@@ -6506,7 +5989,7 @@ mono_gc_wbarrier_object_copy (MonoObject* obj, MonoObject *src)
        size = mono_object_class (obj)->instance_size;
        LOCK_GC;
        /* do not copy the sync state */
-       memcpy ((char*)obj + sizeof (MonoObject), (char*)src + sizeof (MonoObject),
+       mono_gc_memmove ((char*)obj + sizeof (MonoObject), (char*)src + sizeof (MonoObject),
                        size - sizeof (MonoObject));
        if (ptr_in_nursery (obj) || ptr_on_stack (obj)) {
                UNLOCK_GC;
@@ -6517,7 +6000,7 @@ mono_gc_wbarrier_object_copy (MonoObject* obj, MonoObject *src)
                UNLOCK_GC;
                return;
        }
-       rs = alloc_remset (rs->end_set - rs->data, (void*)1);
+       rs = alloc_remset (rs->end_set - rs->data, (void*)1, FALSE);
        rs->next = REMEMBERED_SET;
        REMEMBERED_SET = rs;
 
@@ -6629,18 +6112,7 @@ find_in_remset_loc (mword *p, char *addr, gboolean *found)
                ptr = (void**)(*p & ~REMSET_TYPE_MASK);
                desc = p [1];
                count = p [2];
-
-               switch (desc & 0x7) {
-               case DESC_TYPE_RUN_LENGTH:
-                       OBJ_RUN_LEN_SIZE (skip_size, desc, ptr);
-                       break;
-               case DESC_TYPE_SMALL_BITMAP:
-                       OBJ_BITMAP_SIZE (skip_size, desc, start);
-                       break;
-               default:
-                       // FIXME:
-                       g_assert_not_reached ();
-               }
+               skip_size = p [3];
 
                /* The descriptor includes the size of MonoObject */
                skip_size -= sizeof (MonoObject);
@@ -6648,7 +6120,7 @@ find_in_remset_loc (mword *p, char *addr, gboolean *found)
                if ((void**)addr >= ptr && (void**)addr < ptr + (skip_size / sizeof (gpointer)))
                        *found = TRUE;
 
-               return p + 3;
+               return p + 4;
        default:
                g_assert_not_reached ();
        }
@@ -6726,7 +6198,7 @@ static gboolean missing_remsets;
  */
 #undef HANDLE_PTR
 #define HANDLE_PTR(ptr,obj)    do {    \
-               if (*(ptr) && (char*)*(ptr) >= nursery_start && (char*)*(ptr) < nursery_next) { \
+               if (*(ptr) && (char*)*(ptr) >= nursery_start && (char*)*(ptr) < nursery_end) {  \
                if (!find_in_remsets ((char*)(ptr)) && (!use_cardtable || !sgen_card_table_address_is_marked ((mword)ptr))) { \
                 fprintf (gc_debug_file, "Oldspace->newspace reference %p at offset %td in object %p (%s.%s) not found in remsets.\n", *(ptr), (char*)(ptr) - (char*)(obj), (obj), ((MonoObject*)(obj))->vtable->klass->name_space, ((MonoObject*)(obj))->vtable->klass->name); \
                binary_protocol_missing_remset ((obj), (gpointer)LOAD_VTABLE ((obj)), (char*)(ptr) - (char*)(obj), *(ptr), (gpointer)LOAD_VTABLE(*(ptr)), object_is_pinned (*(ptr))); \
@@ -6894,7 +6366,7 @@ mono_gc_walk_heap (int flags, MonoGCReferences callback, void *data)
        hwi.callback = callback;
        hwi.data = data;
 
-       clear_nursery_fragments (nursery_next);
+       mono_sgen_clear_nursery_fragments ();
        mono_sgen_scan_area_with_callback (nursery_section->data, nursery_section->end_data, walk_references, &hwi, FALSE);
 
        major_collector.iterate_objects (TRUE, TRUE, walk_references, &hwi);
@@ -6998,17 +6470,13 @@ mono_gc_enable_events (void)
 void
 mono_gc_weak_link_add (void **link_addr, MonoObject *obj, gboolean track)
 {
-       LOCK_GC;
-       mono_gc_register_disappearing_link (obj, link_addr, track);
-       UNLOCK_GC;
+       mono_gc_register_disappearing_link (obj, link_addr, track, FALSE);
 }
 
 void
 mono_gc_weak_link_remove (void **link_addr)
 {
-       LOCK_GC;
-       mono_gc_register_disappearing_link (NULL, link_addr, FALSE);
-       UNLOCK_GC;
+       mono_gc_register_disappearing_link (NULL, link_addr, FALSE, FALSE);
 }
 
 MonoObject*
@@ -7136,6 +6604,12 @@ mono_gc_is_gc_thread (void)
        return result;
 }
 
+static gboolean
+is_critical_method (MonoMethod *method)
+{
+       return mono_runtime_is_critical_method (method) || mono_gc_is_critical_method (method);
+}
+
 void
 mono_gc_base_init (void)
 {
@@ -7146,23 +6620,37 @@ mono_gc_base_init (void)
        struct sigaction sinfo;
        glong max_heap = 0;
        int num_workers;
+       int result;
+
+       do {
+               result = InterlockedCompareExchange (&gc_initialized, -1, 0);
+               switch (result) {
+               case 1:
+                       /* already inited */
+                       return;
+               case -1:
+                       /* being inited by another thread */
+                       usleep (1000);
+                       break;
+               case 0:
+                       /* we will init it */
+                       break;
+               default:
+                       g_assert_not_reached ();
+               }
+       } while (result != 0);
 
-       /* the gc_initialized guard seems to imply this method is
-          idempotent, but LOCK_INIT(gc_mutex) might not be.  It's
-          defined in sgen-gc.h as nothing, so there's no danger at
-          present. */
        LOCK_INIT (gc_mutex);
-       LOCK_GC;
-       if (gc_initialized) {
-               UNLOCK_GC;
-               return;
-       }
+
        pagesize = mono_pagesize ();
-       gc_debug_file = stdout;
+       gc_debug_file = stderr;
 
        cb.thread_register = sgen_thread_register;
        cb.thread_unregister = sgen_thread_unregister;
        cb.thread_attach = sgen_thread_attach;
+       cb.mono_method_is_critical = (gpointer)is_critical_method;
+       cb.mono_gc_pthread_create = (gpointer)mono_gc_pthread_create;
+
        mono_threads_init (&cb, sizeof (SgenThreadInfo));
 
        LOCK_INIT (interruption_mutex);
@@ -7184,8 +6672,8 @@ mono_gc_base_init (void)
 
        init_stats ();
        mono_sgen_init_internal_allocator ();
+       mono_sgen_init_nursery_allocator ();
 
-       mono_sgen_register_fixed_internal_mem_type (INTERNAL_MEM_FRAGMENT, sizeof (Fragment));
        mono_sgen_register_fixed_internal_mem_type (INTERNAL_MEM_SECTION, SGEN_SIZEOF_GC_MEM_SECTION);
        mono_sgen_register_fixed_internal_mem_type (INTERNAL_MEM_FINALIZE_ENTRY, sizeof (FinalizeEntry));
        mono_sgen_register_fixed_internal_mem_type (INTERNAL_MEM_DISLINK, sizeof (DisappearingLink));
@@ -7195,6 +6683,37 @@ mono_gc_base_init (void)
        mono_sgen_register_fixed_internal_mem_type (INTERNAL_MEM_STORE_REMSET, sizeof (GenericStoreRememberedSet));
        mono_sgen_register_fixed_internal_mem_type (INTERNAL_MEM_EPHEMERON_LINK, sizeof (EphemeronLinkNode));
 
+       pthread_key_create (&remembered_set_key, NULL);
+
+#ifndef HAVE_KW_THREAD
+       pthread_key_create (&thread_info_key, NULL);
+#endif
+
+       /*
+        * This needs to happen before any internal allocations because
+        * it inits the small id which is required for hazard pointer
+        * operations.
+        */
+       suspend_ack_semaphore_ptr = &suspend_ack_semaphore;
+       MONO_SEM_INIT (&suspend_ack_semaphore, 0);
+
+       sigfillset (&sinfo.sa_mask);
+       sinfo.sa_flags = SA_RESTART | SA_SIGINFO;
+       sinfo.sa_sigaction = suspend_handler;
+       if (sigaction (suspend_signal_num, &sinfo, NULL) != 0) {
+               g_error ("failed sigaction");
+       }
+
+       sinfo.sa_handler = restart_handler;
+       if (sigaction (restart_signal_num, &sinfo, NULL) != 0) {
+               g_error ("failed sigaction");
+       }
+
+       sigfillset (&suspend_signal_mask);
+       sigdelset (&suspend_signal_mask, restart_signal_num);
+
+       mono_thread_info_attach (&sinfo);
+
        if (!major_collector_opt || !strcmp (major_collector_opt, "marksweep")) {
                mono_sgen_marksweep_init (&major_collector);
        } else if (!major_collector_opt || !strcmp (major_collector_opt, "marksweep-fixed")) {
@@ -7355,6 +6874,8 @@ mono_gc_base_init (void)
                                                gc_debug_file = stderr;
                                        g_free (rf);
                                }
+                       } else if (!strcmp (opt, "print-allowance")) {
+                               debug_print_allowance = TRUE;
                        } else if (!strcmp (opt, "collect-before-allocs")) {
                                collect_before_allocs = 1;
                        } else if (g_str_has_prefix (opt, "collect-before-allocs=")) {
@@ -7396,6 +6917,7 @@ mono_gc_base_init (void)
                                fprintf (stderr, "  disable-major\n");
                                fprintf (stderr, "  xdomain-checks\n");
                                fprintf (stderr, "  clear-at-gc\n");
+                               fprintf (stderr, "  print-allowance\n");
                                exit (1);
                        }
                }
@@ -7405,39 +6927,13 @@ mono_gc_base_init (void)
        if (major_collector.post_param_init)
                major_collector.post_param_init ();
 
-       suspend_ack_semaphore_ptr = &suspend_ack_semaphore;
-       MONO_SEM_INIT (&suspend_ack_semaphore, 0);
-
-       sigfillset (&sinfo.sa_mask);
-       sinfo.sa_flags = SA_RESTART | SA_SIGINFO;
-       sinfo.sa_sigaction = suspend_handler;
-       if (sigaction (suspend_signal_num, &sinfo, NULL) != 0) {
-               g_error ("failed sigaction");
-       }
-
-       sinfo.sa_handler = restart_handler;
-       if (sigaction (restart_signal_num, &sinfo, NULL) != 0) {
-               g_error ("failed sigaction");
-       }
-
-       sigfillset (&suspend_signal_mask);
-       sigdelset (&suspend_signal_mask, restart_signal_num);
-
-       global_remset = alloc_remset (1024, NULL);
+       global_remset = alloc_remset (1024, NULL, FALSE);
        global_remset->next = NULL;
 
-       pthread_key_create (&remembered_set_key, NULL);
-
-#ifndef HAVE_KW_THREAD
-       pthread_key_create (&thread_info_key, NULL);
-#endif
-
        if (use_cardtable)
                card_table_init ();
 
-       gc_initialized = TRUE;
-       UNLOCK_GC;
-       mono_thread_info_attach (&sinfo);
+       gc_initialized = 1;
 }
 
 int
@@ -7652,11 +7148,6 @@ create_allocator (int atype)
        mono_mb_emit_byte (mb, CEE_ADD);
        mono_mb_emit_stloc (mb, new_next_var);
 
-       /* tlab_next = new_next */
-       mono_mb_emit_ldloc (mb, tlab_next_addr_var);
-       mono_mb_emit_ldloc (mb, new_next_var);
-       mono_mb_emit_byte (mb, CEE_STIND_I);
-
        /* if (G_LIKELY (new_next < tlab_temp_end)) */
        mono_mb_emit_ldloc (mb, new_next_var);
        EMIT_TLS_ACCESS (mb, tlab_temp_end, tlab_temp_end_offset);
@@ -7687,6 +7178,15 @@ create_allocator (int atype)
 
        /* FIXME: Memory barrier */
 
+       /* tlab_next = new_next */
+       mono_mb_emit_ldloc (mb, tlab_next_addr_var);
+       mono_mb_emit_ldloc (mb, new_next_var);
+       mono_mb_emit_byte (mb, CEE_STIND_I);
+
+       /*The tlab store must be visible before the vtable store. This could be replaced with a DDS but doing it with IL would be tricky. */
+       mono_mb_emit_byte ((mb), MONO_CUSTOM_PREFIX);
+       mono_mb_emit_op (mb, CEE_MONO_MEMORY_BARRIER, StoreStoreBarrier);
+
        /* *p = vtable; */
        mono_mb_emit_ldloc (mb, p_var);
        mono_mb_emit_ldarg (mb, 0);
@@ -7700,6 +7200,12 @@ create_allocator (int atype)
                mono_mb_emit_byte (mb, CEE_STIND_I);
        }
 
+       /*
+       We must make sure both vtable and max_length are globally visible before returning to managed land.
+       */
+       mono_mb_emit_byte ((mb), MONO_CUSTOM_PREFIX);
+       mono_mb_emit_op (mb, CEE_MONO_MEMORY_BARRIER, StoreStoreBarrier);
+
        /* return p */
        mono_mb_emit_ldloc (mb, p_var);
        mono_mb_emit_byte (mb, CEE_RET);
@@ -7726,7 +7232,7 @@ mono_gc_get_gc_name (void)
 static MonoMethod* alloc_method_cache [ATYPE_NUM];
 static MonoMethod *write_barrier_method;
 
-gboolean
+static gboolean
 mono_gc_is_critical_method (MonoMethod *method)
 {
        int i;
@@ -7859,6 +7365,67 @@ mono_gc_get_managed_allocator_types (void)
        return ATYPE_NUM;
 }
 
+static void
+emit_nursery_check (MonoMethodBuilder *mb, int *nursery_check_return_labels)
+{
+       memset (nursery_check_return_labels, 0, sizeof (int) * 3);
+#ifdef SGEN_ALIGN_NURSERY
+       // if (ptr_in_nursery (ptr)) return;
+       /*
+        * Masking out the bits might be faster, but we would have to use 64 bit
+        * immediates, which might be slower.
+        */
+       mono_mb_emit_ldarg (mb, 0);
+       mono_mb_emit_icon (mb, DEFAULT_NURSERY_BITS);
+       mono_mb_emit_byte (mb, CEE_SHR_UN);
+       mono_mb_emit_icon (mb, (mword)nursery_start >> DEFAULT_NURSERY_BITS);
+       nursery_check_return_labels [0] = mono_mb_emit_branch (mb, CEE_BEQ);
+
+       // if (!ptr_in_nursery (*ptr)) return;
+       mono_mb_emit_ldarg (mb, 0);
+       mono_mb_emit_byte (mb, CEE_LDIND_I);
+       mono_mb_emit_icon (mb, DEFAULT_NURSERY_BITS);
+       mono_mb_emit_byte (mb, CEE_SHR_UN);
+       mono_mb_emit_icon (mb, (mword)nursery_start >> DEFAULT_NURSERY_BITS);
+       nursery_check_return_labels [1] = mono_mb_emit_branch (mb, CEE_BNE_UN);
+#else
+       int label_continue_1, label_continue_2;
+       int dereferenced_var;
+
+       // if (ptr < (nursery_start)) goto continue;
+       mono_mb_emit_ldarg (mb, 0);
+       mono_mb_emit_ptr (mb, (gpointer) nursery_start);
+       label_continue_1 = mono_mb_emit_branch (mb, CEE_BLT);
+
+       // if (ptr >= nursery_end) goto continue;
+       mono_mb_emit_ldarg (mb, 0);
+       mono_mb_emit_ptr (mb, (gpointer) nursery_end);
+       label_continue_2 = mono_mb_emit_branch (mb, CEE_BGE);
+
+       // Otherwise return
+       nursery_check_return_labels [0] = mono_mb_emit_branch (mb, CEE_BR);
+
+       // continue:
+       mono_mb_patch_branch (mb, label_continue_1);
+       mono_mb_patch_branch (mb, label_continue_2);
+
+       // Dereference and store in local var
+       dereferenced_var = mono_mb_add_local (mb, &mono_defaults.int_class->byval_arg);
+       mono_mb_emit_ldarg (mb, 0);
+       mono_mb_emit_byte (mb, CEE_LDIND_I);
+       mono_mb_emit_stloc (mb, dereferenced_var);
+
+       // if (*ptr < nursery_start) return;
+       mono_mb_emit_ldloc (mb, dereferenced_var);
+       mono_mb_emit_ptr (mb, (gpointer) nursery_start);
+       nursery_check_return_labels [1] = mono_mb_emit_branch (mb, CEE_BLT);
+
+       // if (*ptr >= nursery_end) return;
+       mono_mb_emit_ldloc (mb, dereferenced_var);
+       mono_mb_emit_ptr (mb, (gpointer) nursery_end);
+       nursery_check_return_labels [2] = mono_mb_emit_branch (mb, CEE_BGE);
+#endif 
+}
 
 MonoMethod*
 mono_gc_get_write_barrier (void)
@@ -7867,12 +7434,10 @@ mono_gc_get_write_barrier (void)
        MonoMethodBuilder *mb;
        MonoMethodSignature *sig;
 #ifdef MANAGED_WBARRIER
-       int label_no_wb_1, label_no_wb_2, label_no_wb_3, label_no_wb_4, label_need_wb, label_slow_path;
-#ifndef SGEN_ALIGN_NURSERY
-       int label_continue_1, label_continue_2, label_no_wb_5;
-       int dereferenced_var;
-#endif
+       int i, nursery_check_labels [3];
+       int label_no_wb_3, label_no_wb_4, label_need_wb, label_slow_path;
        int buffer_var, buffer_index_var, dummy_var;
+       gboolean use_managed_barrier;
 
 #ifdef HAVE_KW_THREAD
        int stack_end_offset = -1, store_remset_buffer_offset = -1;
@@ -7889,8 +7454,6 @@ mono_gc_get_write_barrier (void)
 #endif
 #endif
 
-       g_assert (!use_cardtable);
-
        // FIXME: Maybe create a separate version for ctors (the branch would be
        // correctly predicted more times)
        if (write_barrier_method)
@@ -7904,62 +7467,51 @@ mono_gc_get_write_barrier (void)
        mb = mono_mb_new (mono_defaults.object_class, "wbarrier", MONO_WRAPPER_WRITE_BARRIER);
 
 #ifdef MANAGED_WBARRIER
-       if (mono_runtime_has_tls_get ()) {
-#ifdef SGEN_ALIGN_NURSERY
-               // if (ptr_in_nursery (ptr)) return;
-               /*
-                * Masking out the bits might be faster, but we would have to use 64 bit
-                * immediates, which might be slower.
-                */
-               mono_mb_emit_ldarg (mb, 0);
-               mono_mb_emit_icon (mb, DEFAULT_NURSERY_BITS);
-               mono_mb_emit_byte (mb, CEE_SHR_UN);
-               mono_mb_emit_icon (mb, (mword)nursery_start >> DEFAULT_NURSERY_BITS);
-               label_no_wb_1 = mono_mb_emit_branch (mb, CEE_BEQ);
+       use_managed_barrier = TRUE;
+#endif
 
-               // if (!ptr_in_nursery (*ptr)) return;
+       if (use_managed_barrier && use_cardtable) {
+               emit_nursery_check (mb, nursery_check_labels);
+               /*
+               addr = sgen_cardtable + ((address >> CARD_BITS) & CARD_MASK)
+               *addr = 1;
+
+               sgen_cardtable: 
+                       LDC_PTR sgen_cardtable
+
+               address >> CARD_BITS
+                       LDARG_0
+                       LDC_I4 CARD_BITS
+                       SHR_UN
+               if (SGEN_HAVE_OVERLAPPING_CARDS) {
+                       LDC_PTR card_table_mask
+                       AND
+               }
+               AND
+               ldc_i4_1
+               stind_i1
+               */
+               mono_mb_emit_ptr (mb, sgen_cardtable);
                mono_mb_emit_ldarg (mb, 0);
-               mono_mb_emit_byte (mb, CEE_LDIND_I);
-               mono_mb_emit_icon (mb, DEFAULT_NURSERY_BITS);
+               mono_mb_emit_icon (mb, CARD_BITS);
                mono_mb_emit_byte (mb, CEE_SHR_UN);
-               mono_mb_emit_icon (mb, (mword)nursery_start >> DEFAULT_NURSERY_BITS);
-               label_no_wb_2 = mono_mb_emit_branch (mb, CEE_BNE_UN);
-#else
-
-               // if (ptr < (nursery_start)) goto continue;
-               mono_mb_emit_ldarg (mb, 0);
-               mono_mb_emit_ptr (mb, (gpointer) nursery_start);
-               label_continue_1 = mono_mb_emit_branch (mb, CEE_BLT);
-
-               // if (ptr >= nursery_end)) goto continue;
-               mono_mb_emit_ldarg (mb, 0);
-               mono_mb_emit_ptr (mb, (gpointer) nursery_end);
-               label_continue_2 = mono_mb_emit_branch (mb, CEE_BGE);
-
-               // Otherwise return
-               label_no_wb_1 = mono_mb_emit_branch (mb, CEE_BR);
-
-               // continue:
-               mono_mb_patch_branch (mb, label_continue_1);
-               mono_mb_patch_branch (mb, label_continue_2);
-
-               // Dereference and store in local var
-               dereferenced_var = mono_mb_add_local (mb, &mono_defaults.int_class->byval_arg);
-               mono_mb_emit_ldarg (mb, 0);
-               mono_mb_emit_byte (mb, CEE_LDIND_I);
-               mono_mb_emit_stloc (mb, dereferenced_var);
-
-               // if (*ptr < nursery_start) return;
-               mono_mb_emit_ldloc (mb, dereferenced_var);
-               mono_mb_emit_ptr (mb, (gpointer) nursery_start);
-               label_no_wb_2 = mono_mb_emit_branch (mb, CEE_BLT);
+#ifdef SGEN_HAVE_OVERLAPPING_CARDS
+               mono_mb_emit_ptr (mb, (gpointer)CARD_MASK);
+               mono_mb_emit_byte (mb, CEE_AND);
+#endif
+               mono_mb_emit_byte (mb, CEE_ADD);
+               mono_mb_emit_icon (mb, 1);
+               mono_mb_emit_byte (mb, CEE_STIND_I1);
 
-               // if (*ptr >= nursery_end) return;
-               mono_mb_emit_ldloc (mb, dereferenced_var);
-               mono_mb_emit_ptr (mb, (gpointer) nursery_end);
-               label_no_wb_5 = mono_mb_emit_branch (mb, CEE_BGE);
+               // return;
+               for (i = 0; i < 3; ++i) {
+                       if (nursery_check_labels [i])
+                               mono_mb_patch_branch (mb, nursery_check_labels [i]);
+               }               
+               mono_mb_emit_byte (mb, CEE_RET);
+       } else if (use_managed_barrier && mono_runtime_has_tls_get ()) {
+               emit_nursery_check (mb, nursery_check_labels);
 
-#endif 
                // if (ptr >= stack_end) goto need_wb;
                mono_mb_emit_ldarg (mb, 0);
                EMIT_TLS_ACCESS (mb, stack_end, stack_end_offset);
@@ -8022,23 +7574,26 @@ mono_gc_get_write_barrier (void)
                mono_mb_emit_byte (mb, CEE_STIND_I);
 
                // return;
-               mono_mb_patch_branch (mb, label_no_wb_1);
-               mono_mb_patch_branch (mb, label_no_wb_2);
+               for (i = 0; i < 3; ++i) {
+                       if (nursery_check_labels [i])
+                               mono_mb_patch_branch (mb, nursery_check_labels [i]);
+               }
                mono_mb_patch_branch (mb, label_no_wb_3);
                mono_mb_patch_branch (mb, label_no_wb_4);
-#ifndef SGEN_ALIGN_NURSERY
-               mono_mb_patch_branch (mb, label_no_wb_5);
-#endif
                mono_mb_emit_byte (mb, CEE_RET);
 
                // slow path
                mono_mb_patch_branch (mb, label_slow_path);
+
+               mono_mb_emit_ldarg (mb, 0);
+               mono_mb_emit_icall (mb, mono_gc_wbarrier_generic_nostore);
+               mono_mb_emit_byte (mb, CEE_RET);
+       } else {
+               mono_mb_emit_ldarg (mb, 0);
+               mono_mb_emit_icall (mb, mono_gc_wbarrier_generic_nostore);
+               mono_mb_emit_byte (mb, CEE_RET);                
        }
-#endif
 
-       mono_mb_emit_ldarg (mb, 0);
-       mono_mb_emit_icall (mb, mono_gc_wbarrier_generic_nostore);
-       mono_mb_emit_byte (mb, CEE_RET);
 
        res = mono_mb_create_method (mb, sig, 16);
        mono_mb_free (mb);
@@ -8106,4 +7661,16 @@ BOOL APIENTRY mono_gc_dllmain (HMODULE module_handle, DWORD reason, LPVOID reser
 }
 #endif
 
+NurseryClearPolicy
+mono_sgen_get_nursery_clear_policy (void)
+{
+       return nursery_clear_policy;
+}
+
+MonoVTable*
+mono_sgen_get_array_fill_vtable (void)
+{
+       return array_fill_vtable;
+}
+
 #endif /* HAVE_SGEN_GC */