Fix the build.
diff --git a/mono/metadata/sgen-gc.c b/mono/metadata/sgen-gc.c
index dc73fcc0a0e9c28b68c3b30e867e5819994a1466..1baa5ca3ee857c52623176cd9d0cd6db4449fb24 100644
--- a/mono/metadata/sgen-gc.c
+++ b/mono/metadata/sgen-gc.c
 #include <signal.h>
 #include <errno.h>
 #include <assert.h>
-#include <sys/types.h>
-#include <sys/stat.h>
-#include <sys/mman.h>
-#include <sys/time.h>
-#include <time.h>
-#include <fcntl.h>
 #include "metadata/metadata-internals.h"
 #include "metadata/class-internals.h"
 #include "metadata/gc-internal.h"
 #include "metadata/object-internals.h"
 #include "metadata/threads.h"
 #include "metadata/sgen-gc.h"
+#include "metadata/sgen-archdep.h"
 #include "metadata/mono-gc.h"
 #include "metadata/method-builder.h"
 #include "metadata/profiler-private.h"
+#include "metadata/monitor.h"
+#include "metadata/threadpool-internals.h"
+#include "metadata/mempool-internals.h"
+#include "metadata/marshal.h"
 #include "utils/mono-mmap.h"
+#include "utils/mono-time.h"
+#include "utils/mono-semaphore.h"
+#include "utils/mono-counters.h"
 
-#ifdef HAVE_VALGRIND_MEMCHECK_H
-#include <valgrind/memcheck.h>
-#endif
+#include <mono/utils/memcheck.h>
 
 #define OPDEF(a,b,c,d,e,f,g,h,i,j) \
        a = i,
@@ -180,6 +180,80 @@ static FILE* gc_debug_file;
 static gboolean collect_before_allocs = FALSE;
 /* If set, do a heap consistency check before each minor collection */
 static gboolean consistency_check_at_minor_collection = FALSE;
+/* If set, check that there are no references to the domain left at domain unload */
+static gboolean xdomain_checks = FALSE;
+/* If not null, dump the heap after each collection into this file */
+static FILE *heap_dump_file = NULL;
+/* If set, mark stacks conservatively, even if precise marking is possible */
+static gboolean conservative_stack_mark = FALSE;
+
+/*
+ * Turning on heavy statistics will turn off the managed allocator and
+ * the managed write barrier.
+ */
+//#define HEAVY_STATISTICS
+
+#ifdef HEAVY_STATISTICS
+#define HEAVY_STAT(x)  x
+#else
+#define HEAVY_STAT(x)
+#endif
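
A minimal usage sketch of the HEAVY_STAT wrapper above (counter and function names are hypothetical): when HEAVY_STATISTICS is not defined the statement compiles away entirely, so the fast paths pay nothing for it.

#ifdef HEAVY_STATISTICS
static long stat_example_events = 0;
#endif

static void
example_count_event (void)
{
        /* expands to nothing unless HEAVY_STATISTICS is defined */
        HEAVY_STAT (++stat_example_events);
}
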
+
+#ifdef HEAVY_STATISTICS
+static long stat_objects_alloced = 0;
+static long stat_copy_object_called_nursery = 0;
+static long stat_objects_copied_nursery = 0;
+static long stat_copy_object_called_major = 0;
+static long stat_objects_copied_major = 0;
+
+static long stat_copy_object_failed_from_space = 0;
+static long stat_copy_object_failed_forwarded = 0;
+static long stat_copy_object_failed_pinned = 0;
+static long stat_copy_object_failed_large_pinned = 0;
+static long stat_copy_object_failed_to_space = 0;
+
+static long stat_store_remsets = 0;
+static long stat_store_remsets_unique = 0;
+static long stat_saved_remsets_1 = 0;
+static long stat_saved_remsets_2 = 0;
+static long stat_global_remsets_added = 0;
+static long stat_global_remsets_processed = 0;
+
+static long num_copy_object_called = 0;
+static long num_objects_copied = 0;
+
+static int stat_wbarrier_set_field = 0;
+static int stat_wbarrier_set_arrayref = 0;
+static int stat_wbarrier_arrayref_copy = 0;
+static int stat_wbarrier_generic_store = 0;
+static int stat_wbarrier_generic_store_remset = 0;
+static int stat_wbarrier_set_root = 0;
+static int stat_wbarrier_value_copy = 0;
+static int stat_wbarrier_object_copy = 0;
+#endif
+
+static long pinned_chunk_bytes_alloced = 0;
+static long large_internal_bytes_alloced = 0;
+
+enum {
+       INTERNAL_MEM_PIN_QUEUE,
+       INTERNAL_MEM_FRAGMENT,
+       INTERNAL_MEM_SECTION,
+       INTERNAL_MEM_SCAN_STARTS,
+       INTERNAL_MEM_FIN_TABLE,
+       INTERNAL_MEM_FINALIZE_ENTRY,
+       INTERNAL_MEM_DISLINK_TABLE,
+       INTERNAL_MEM_DISLINK,
+       INTERNAL_MEM_ROOTS_TABLE,
+       INTERNAL_MEM_ROOT_RECORD,
+       INTERNAL_MEM_STATISTICS,
+       INTERNAL_MEM_REMSET,
+       INTERNAL_MEM_GRAY_QUEUE,
+       INTERNAL_MEM_STORE_REMSET,
+       INTERNAL_MEM_MAX
+};
+
+static long small_internal_mem_bytes [INTERNAL_MEM_MAX];
 
 /*
 void
@@ -192,24 +266,30 @@ mono_gc_flush_info (void)
 #define MAX_DEBUG_LEVEL 8
 #define DEBUG(level,a) do {if (G_UNLIKELY ((level) <= MAX_DEBUG_LEVEL && (level) <= gc_debug_level)) a;} while (0)
 
-#define TV_DECLARE(name) struct timeval name
-#define TV_GETTIME(tv) gettimeofday (&(tv), NULL)
-#define TV_ELAPSED(start,end) (int)((((end).tv_sec - (start).tv_sec) * 1000000) + end.tv_usec - start.tv_usec)
+#define TV_DECLARE(name) gint64 name
+#define TV_GETTIME(tv) tv = mono_100ns_ticks ()
+#define TV_ELAPSED(start,end) (int)((end-start) / 10)
+
+#define ALIGN_TO(val,align) ((((guint64)val) + ((align) - 1)) & ~((align) - 1))
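
A hedged usage sketch of the reworked timing macros (variable and function names are illustrative only): mono_100ns_ticks () comes from the newly included utils/mono-time.h, and TV_ELAPSED still yields microseconds, so existing callers keep their units.

static void
example_time_phase (void)
{
        TV_DECLARE (atv);
        TV_DECLARE (btv);

        TV_GETTIME (atv);
        /* ... the phase being measured ... */
        TV_GETTIME (btv);
        DEBUG (2, fprintf (gc_debug_file, "phase took %d usecs\n", TV_ELAPSED (atv, btv)));
}
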
 
 #define GC_BITS_PER_WORD (sizeof (mword) * 8)
 
 enum {
        MEMORY_ROLE_GEN0,
        MEMORY_ROLE_GEN1,
-       MEMORY_ROLE_GEN2,
-       MEMORY_ROLE_FIXED,
-       MEMORY_ROLE_INTERNAL
+       MEMORY_ROLE_PINNED
+};
+
+typedef struct _Block Block;
+struct _Block {
+       void *next;
+       unsigned char role;
 };
 
 /* each request from the OS ends up in a GCMemSection */
 typedef struct _GCMemSection GCMemSection;
 struct _GCMemSection {
-       GCMemSection *next;
+       Block block;
        char *data;
        mword size;
        /* pointer where more data could be allocated if it fits */
@@ -224,9 +304,11 @@ struct _GCMemSection {
        int pin_queue_start;
        int pin_queue_end;
        unsigned short num_scan_start;
-       unsigned char role;
+       gboolean is_to_space;
 };
 
+#define SIZEOF_GC_MEM_SECTION  ((sizeof (GCMemSection) + 7) & ~7)
+
 /* large object space struct: 64+ KB */
 /* we could make this limit much smaller to avoid memcpy copy
  * and potentially have more room in the GC descriptor: need to measure
@@ -287,7 +369,7 @@ struct _LOSObject {
 #define PINNED_CHUNK_MIN_SIZE (4096*8)
 typedef struct _PinnedChunk PinnedChunk;
 struct _PinnedChunk {
-       PinnedChunk *next;
+       Block block;
        int num_pages;
        int *page_sizes; /* a 0 means the page is still unused */
        void **free_list;
@@ -360,6 +442,21 @@ struct _RememberedSet {
        mword data [MONO_ZERO_LEN_ARRAY];
 };
 
+/*
+ * We're never actually using the first element.  It's always set to
+ * NULL to simplify the elimination of consecutive duplicate
+ * entries.
+ */
+#define STORE_REMSET_BUFFER_SIZE       1024
+
+typedef struct _GenericStoreRememberedSet GenericStoreRememberedSet;
+struct _GenericStoreRememberedSet {
+       GenericStoreRememberedSet *next;
+       /* We need one entry less because the first entry of store
+          remset buffers is always a dummy and we don't copy it. */
+       gpointer data [STORE_REMSET_BUFFER_SIZE - 1];
+};
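
A hedged sketch (the helper name is hypothetical) of the duplicate-elimination trick the comment above describes: because slot 0 of each per-thread buffer is a permanent NULL sentinel, a store to the same location as the previous one can be skipped with a single compare and no special case for the very first entry.

static void
example_record_store (gpointer *buffer, long *index, gpointer addr)
{
        if (buffer [*index] == addr)
                return;                 /* consecutive duplicate: already recorded */
        buffer [++*index] = addr;       /* caller flushes once the buffer fills up */
}
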
+
 /* we have 4 possible values in the low 2 bits */
 enum {
        REMSET_LOCATION, /* just a pointer to the exact location */
@@ -371,14 +468,18 @@ enum {
 
 /* Subtypes of REMSET_OTHER */
 enum {
-       REMSET_VTYPE, /* a valuetype described by a gc descriptor */
+       REMSET_VTYPE, /* a valuetype array described by a gc descriptor and a count */
        REMSET_ROOT_LOCATION, /* a location inside a root */
 };
 
+#ifdef HAVE_KW_THREAD
 static __thread RememberedSet *remembered_set MONO_TLS_FAST;
+#endif
 static pthread_key_t remembered_set_key;
 static RememberedSet *global_remset;
+static RememberedSet *freed_thread_remsets;
 //static int store_to_global_remset = 0;
+static GenericStoreRememberedSet *generic_store_remsets = NULL;
 
 /* FIXME: later choose a size that takes into account the RememberedSet struct
  * and doesn't waste any alloc padding space.
@@ -452,8 +553,8 @@ safe_object_get_size (MonoObject* o)
                MonoArray *array = (MonoArray*)o;
                size_t size = sizeof (MonoArray) + mono_array_element_size (klass) * mono_array_length (array);
                if (G_UNLIKELY (array->bounds)) {
-                       size += 3;
-                       size &= ~3;
+                       size += sizeof (mono_array_size_t) - 1;
+                       size &= ~(sizeof (mono_array_size_t) - 1);
                        size += sizeof (MonoArrayBounds) * klass->rank;
                }
                return size;
@@ -463,19 +564,6 @@ safe_object_get_size (MonoObject* o)
        }
 }
 
-static inline gboolean
-is_maybe_half_constructed (MonoObject *o)
-{
-       MonoClass *klass;
-
-       klass = ((MonoVTable*)LOAD_VTABLE (o))->klass;
-       if ((klass == mono_defaults.string_class && mono_string_length ((MonoString*)o) == 0) ||
-               (klass->rank && mono_array_length ((MonoArray*)o) == 0))
-               return TRUE;
-       else
-               return FALSE;
-}
-
 /*
  * ######################################################################
  * ########  Global data.
@@ -491,7 +579,10 @@ static int num_major_gcs = 0;
 #define DEFAULT_NURSERY_SIZE (1024*512*2)
 /* The number of trailing 0 bits in DEFAULT_NURSERY_SIZE */
 #define DEFAULT_NURSERY_BITS 20
-#define DEFAULT_MAX_SECTION (DEFAULT_NURSERY_SIZE * 16)
+#define MAJOR_SECTION_SIZE     (128*1024)
+#define BLOCK_FOR_OBJECT(o)            ((Block*)(((mword)(o)) & ~(MAJOR_SECTION_SIZE - 1)))
+#define MAJOR_SECTION_FOR_OBJECT(o)    ((GCMemSection*)BLOCK_FOR_OBJECT ((o)))
+#define MIN_MINOR_COLLECTION_SECTION_ALLOWANCE (DEFAULT_NURSERY_SIZE * 3 / MAJOR_SECTION_SIZE)
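
A hedged sketch of how the new block-based lookup is meant to be used: as the copy_object comment later in this patch notes, major sections and pinned chunks live in aligned blocks that start with a Block header, so masking any interior pointer with MAJOR_SECTION_SIZE - 1 recovers the header (and, for major sections, the GCMemSection) in constant time. The helper name is hypothetical.

static gboolean
example_obj_is_in_to_space (char *obj)
{
        /* callers must already have excluded LOS objects and pinned chunks */
        GCMemSection *section = MAJOR_SECTION_FOR_OBJECT (obj);
        g_assert (section->block.role != MEMORY_ROLE_PINNED);
        return section->is_to_space;
}
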
 #define DEFAULT_LOS_COLLECTION_TARGET (DEFAULT_NURSERY_SIZE * 2)
 /* to quickly find the head of an object pinned by a conservative address
  * we keep track of the objects allocated for each SCAN_START_SIZE memory
@@ -506,11 +597,12 @@ static int num_major_gcs = 0;
 
 static mword pagesize = 4096;
 static mword nursery_size = DEFAULT_NURSERY_SIZE;
-static mword next_section_size = DEFAULT_NURSERY_SIZE * 4;
-static mword max_section_size = DEFAULT_MAX_SECTION;
-static int section_size_used = 0;
 static int degraded_mode = 0;
 
+static int minor_collection_section_allowance = MIN_MINOR_COLLECTION_SECTION_ALLOWANCE;
+static int minor_collection_sections_alloced = 0;
+static int num_major_sections = 0;
+
 static LOSObject *los_object_list = NULL;
 static mword los_memory_usage = 0;
 static mword los_num_objects = 0;
@@ -530,12 +622,41 @@ struct _FinalizeEntry {
        void *object;
 };
 
+typedef struct _FinalizeEntryHashTable FinalizeEntryHashTable;
+struct _FinalizeEntryHashTable {
+       FinalizeEntry **table;
+       mword size;
+       int num_registered;
+};
+
 typedef struct _DisappearingLink DisappearingLink;
 struct _DisappearingLink {
        DisappearingLink *next;
        void **link;
 };
 
+typedef struct _DisappearingLinkHashTable DisappearingLinkHashTable;
+struct _DisappearingLinkHashTable {
+       DisappearingLink **table;
+       mword size;
+       int num_links;
+};
+
+#define LARGE_INTERNAL_MEM_HEADER_MAGIC        0x7d289f3a
+
+typedef struct _LargeInternalMemHeader LargeInternalMemHeader;
+struct _LargeInternalMemHeader {
+       guint32 magic;
+       size_t size;
+       double data[0];
+};
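
The free path is not shown in this hunk; the following is only a hedged sketch of what the header above implies: internal allocations too large for the pinned-chunk freelists are prefixed with a LargeInternalMemHeader, so the magic and the recorded size can be verified and the accounting reversed when the block is released (helper name is hypothetical).

static void
example_free_large_internal (void *addr)
{
        LargeInternalMemHeader *mh = (LargeInternalMemHeader*)((char*)addr - G_STRUCT_OFFSET (LargeInternalMemHeader, data));
        g_assert (mh->magic == LARGE_INTERNAL_MEM_HEADER_MAGIC);
        large_internal_bytes_alloced -= mh->size;
        free_os_memory (mh, mh->size);
}
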
+
+enum {
+       GENERATION_NURSERY,
+       GENERATION_OLD,
+       GENERATION_MAX
+};
+
 /*
  * The link pointer is hidden by negating each bit.  We use the lowest
  * bit of the link (before negation) to store whether it needs
@@ -551,17 +672,16 @@ struct _DisappearingLink {
  * The finalizable hash has the object as the key, the 
  * disappearing_link hash, has the link address as key.
  */
-static FinalizeEntry **finalizable_hash = NULL;
+static FinalizeEntryHashTable minor_finalizable_hash;
+static FinalizeEntryHashTable major_finalizable_hash;
 /* objects that are ready to be finalized */
 static FinalizeEntry *fin_ready_list = NULL;
 static FinalizeEntry *critical_fin_list = NULL;
-static DisappearingLink **disappearing_link_hash = NULL;
-static mword disappearing_link_hash_size = 0;
-static mword finalizable_hash_size = 0;
 
-static int num_registered_finalizers = 0;
+static DisappearingLinkHashTable minor_disappearing_link_hash;
+static DisappearingLinkHashTable major_disappearing_link_hash;
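
Finalizers and disappearing links are now tracked per generation, so registration and nulling code is expected to pick the table for the generation it is working on first. A hedged sketch (the helper name is hypothetical):

static FinalizeEntryHashTable*
example_finalizable_hash_for_generation (int generation)
{
        if (generation == GENERATION_NURSERY)
                return &minor_finalizable_hash;
        g_assert (generation == GENERATION_OLD);
        return &major_finalizable_hash;
}
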
+
 static int num_ready_finalizers = 0;
-static int num_disappearing_links = 0;
 static int no_finalize = 0;
 
 /* keep each size a multiple of ALLOC_ALIGN */
@@ -582,12 +702,7 @@ static PinnedChunk *internal_chunk_list = NULL;
 static gboolean
 obj_is_from_pinned_alloc (char *p)
 {
-       PinnedChunk *chunk = pinned_chunk_list;
-       for (; chunk; chunk = chunk->next) {
-               if (p >= (char*)chunk->start_data && p < ((char*)chunk + chunk->num_pages * FREELIST_PAGESIZE))
-                       return TRUE;
-       }
-       return FALSE;
+       return BLOCK_FOR_OBJECT (p)->role == MEMORY_ROLE_PINNED;
 }
 
 static int slot_for_size (size_t size);
@@ -639,16 +754,76 @@ static int num_roots_entries [ROOT_TYPE_NUM] = { 0, 0, 0 };
  */
 static char *nursery_start = NULL;
 
+/* eventually share with MonoThread? */
+typedef struct _SgenThreadInfo SgenThreadInfo;
+
+struct _SgenThreadInfo {
+       SgenThreadInfo *next;
+       ARCH_THREAD_TYPE id;
+       unsigned int stop_count; /* to catch duplicate signals */
+       int signal;
+       int skip;
+       void *stack_end;
+       void *stack_start;
+       void *stack_start_limit;
+       char **tlab_next_addr;
+       char **tlab_start_addr;
+       char **tlab_temp_end_addr;
+       char **tlab_real_end_addr;
+       gpointer **store_remset_buffer_addr;
+       long *store_remset_buffer_index_addr;
+       RememberedSet *remset;
+       gpointer runtime_data;
+       gpointer stopped_ip;    /* only valid if the thread is stopped */
+       MonoDomain *stopped_domain; /* ditto */
+       gpointer *stopped_regs;     /* ditto */
+#ifndef HAVE_KW_THREAD
+       char *tlab_start;
+       char *tlab_next;
+       char *tlab_temp_end;
+       char *tlab_real_end;
+       gpointer *store_remset_buffer;
+       long store_remset_buffer_index;
+#endif
+};
+
+#ifdef HAVE_KW_THREAD
+#define TLAB_ACCESS_INIT
+#define TLAB_START     tlab_start
+#define TLAB_NEXT      tlab_next
+#define TLAB_TEMP_END  tlab_temp_end
+#define TLAB_REAL_END  tlab_real_end
+#define REMEMBERED_SET remembered_set
+#define STORE_REMSET_BUFFER    store_remset_buffer
+#define STORE_REMSET_BUFFER_INDEX      store_remset_buffer_index
+#else
+static pthread_key_t thread_info_key;
+#define TLAB_ACCESS_INIT       SgenThreadInfo *__thread_info__ = pthread_getspecific (thread_info_key)
+#define TLAB_START     (__thread_info__->tlab_start)
+#define TLAB_NEXT      (__thread_info__->tlab_next)
+#define TLAB_TEMP_END  (__thread_info__->tlab_temp_end)
+#define TLAB_REAL_END  (__thread_info__->tlab_real_end)
+#define REMEMBERED_SET (__thread_info__->remset)
+#define STORE_REMSET_BUFFER    (__thread_info__->store_remset_buffer)
+#define STORE_REMSET_BUFFER_INDEX      (__thread_info__->store_remset_buffer_index)
+#endif
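
A hedged sketch of how the accessor macros above are meant to be used on the allocation fast path: the same code works whether __thread is available (HAVE_KW_THREAD) or the SgenThreadInfo must be fetched from pthread TLS; only TLAB_ACCESS_INIT changes meaning. The function is illustrative, not the patch's allocator.

static void*
example_try_tlab_alloc (size_t size)
{
        TLAB_ACCESS_INIT;

        if (TLAB_NEXT + size <= TLAB_TEMP_END) {
                void *p = TLAB_NEXT;
                TLAB_NEXT += size;
                return p;
        }
        return NULL;    /* caller falls back to the slow path and refills the TLAB */
}
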
+
 /*
  * FIXME: What is faster, a TLS variable pointing to a structure, or separate TLS 
  * variables for next+temp_end ?
  */
+#ifdef HAVE_KW_THREAD
 static __thread char *tlab_start;
 static __thread char *tlab_next;
 static __thread char *tlab_temp_end;
 static __thread char *tlab_real_end;
-/* Used by the managed allocator */
+static __thread gpointer *store_remset_buffer;
+static __thread long store_remset_buffer_index;
+/* Used by the managed allocator/wbarrier */
 static __thread char **tlab_next_addr;
+static __thread char *stack_end;
+static __thread long *store_remset_buffer_index_addr;
+#endif
 static char *nursery_next = NULL;
 static char *nursery_frag_real_end = NULL;
 static char *nursery_real_end = NULL;
@@ -670,22 +845,13 @@ static Fragment *fragment_freelist = NULL;
 
 /* 
  * used when moving the objects
- * When the nursery is collected, objects are copied to to_space.
- * The area between gray_first and gray_objects is used as a stack
- * of objects that need their fields checked for more references
- * to be copied.
- * We should optimize somehow this mechanism to avoid rescanning
- * ptr-free objects. The order is also probably not optimal: need to
- * test cache misses and other graph traversal orders.
  */
-static char *to_space = NULL;
-static char *gray_first = NULL;
-static char *gray_objects = NULL;
-static char *to_space_end = NULL;
+static char *to_space_bumper = NULL;
+static char *to_space_top = NULL;
 static GCMemSection *to_space_section = NULL;
 
 /* objects bigger then this go into the large object space */
-#define MAX_SMALL_OBJ_SIZE 0xffff
+#define MAX_SMALL_OBJ_SIZE MAX_FREELIST_SIZE
 
 /* Functions supplied by the runtime to be called by the GC */
 static MonoGCCallbacks gc_callbacks;
@@ -702,6 +868,7 @@ static MonoGCCallbacks gc_callbacks;
                if ((mword)(high) > highest_heap_address)       \
                        highest_heap_address = (mword)(high);   \
        } while (0)
+#define ADDR_IN_HEAP_BOUNDARIES(addr) ((addr) >= lowest_heap_address && (addr) < highest_heap_address)
 
 inline static void*
 align_pointer (void *ptr)
@@ -713,8 +880,8 @@ align_pointer (void *ptr)
 }
 
 /* forward declarations */
-static void* get_internal_mem          (size_t size);
-static void  free_internal_mem         (void *addr);
+static void* get_internal_mem          (size_t size, int type);
+static void  free_internal_mem         (void *addr, int type);
 static void* get_os_memory             (size_t size, int activate);
 static void  free_os_memory            (void *addr, size_t size);
 static G_GNUC_UNUSED void  report_internal_mem_usage (void);
@@ -725,25 +892,31 @@ static void scan_thread_data (void *start_nursery, void *end_nursery, gboolean p
 static void scan_from_remsets (void *start_nursery, void *end_nursery);
 static void find_pinning_ref_from_thread (char *obj, size_t size);
 static void update_current_thread_stack (void *start);
-static GCMemSection* alloc_section (size_t size);
-static void finalize_in_range (char *start, char *end);
-static void null_link_in_range (char *start, char *end);
-static void null_links_for_domain (MonoDomain *domain);
+static GCMemSection* alloc_major_section (void);
+static void finalize_in_range (char *start, char *end, int generation);
+static void add_or_remove_disappearing_link (MonoObject *obj, void **link, gboolean track, int generation);
+static void null_link_in_range (char *start, char *end, int generation);
+static void null_links_for_domain (MonoDomain *domain, int generation);
 static gboolean search_fragment_for_size (size_t size);
 static void mark_pinned_from_addresses (PinnedChunk *chunk, void **start, void **end);
 static void clear_remsets (void);
 static void clear_tlabs (void);
-static char *find_tlab_next_from_address (char *addr);
-static void scan_pinned_objects (void (*callback) (PinnedChunk*, char*, size_t, void*), void *callback_data);
+typedef void (*ScanPinnedObjectCallbackFunc) (PinnedChunk*, char*, size_t, void*);
+static void scan_pinned_objects (ScanPinnedObjectCallbackFunc callback, void *callback_data);
 static void sweep_pinned_objects (void);
 static void scan_from_pinned_objects (char *addr_start, char *addr_end);
 static void free_large_object (LOSObject *obj);
-static void free_mem_section (GCMemSection *section);
+static void free_major_section (GCMemSection *section);
+static void to_space_expand (void);
+
+static void mono_gc_register_disappearing_link (MonoObject *obj, void **link, gboolean track);
 
 void describe_ptr (char *ptr);
 void check_consistency (void);
 char* check_object (char *start);
 
+void mono_gc_scan_for_specific_ref (MonoObject *key);
+
 /*
  * ######################################################################
  * ########  GC descriptors
@@ -975,7 +1148,7 @@ mono_gc_get_bitmap_for_descr (void *descr, int *numbits)
        switch (d & 0x7) {
        case DESC_TYPE_RUN_LENGTH: {            
                int first_set = (d >> 16) & 0xff;
-               int num_set = (d >> 16) & 0xff;
+               int num_set = (d >> 24) & 0xff;
                int i;
 
                bitmap = g_new0 (gsize, (first_set + num_set + 7) / 8);
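
The one-line change above stops num_set from re-reading the first_set byte. Under the layout this fix implies (first_set in bits 16-23 and num_set in bits 24-31 of a run-length descriptor), decoding looks like this hedged sketch:

static void
example_decode_run_length (mword d, int *first_set, int *num_set)
{
        *first_set = (d >> 16) & 0xff;  /* where the run of references starts */
        *num_set = (d >> 24) & 0xff;    /* how many consecutive reference slots it covers */
}
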
@@ -1171,6 +1344,29 @@ mono_gc_get_bitmap_for_descr (void *descr, int *numbits)
                }       \
        } while (0)
 
+#define COUNT_OBJECT_TYPES do {                                                \
+       switch (desc & 0x7) {                                           \
+       case DESC_TYPE_STRING: type_str++; break;                       \
+       case DESC_TYPE_RUN_LENGTH: type_rlen++; break;                  \
+       case DESC_TYPE_ARRAY: case DESC_TYPE_VECTOR: type_vector++; break; \
+       case DESC_TYPE_SMALL_BITMAP: type_bitmap++; break;              \
+       case DESC_TYPE_LARGE_BITMAP: type_lbit++; break;                \
+       case DESC_TYPE_COMPLEX: type_complex++; break;                  \
+       case DESC_TYPE_COMPLEX_ARR: type_complex++; break;              \
+       default: g_assert_not_reached ();                               \
+       }                                                               \
+       } while (0)
+
+
+/*
+ * ######################################################################
+ * ########  Detecting and removing garbage.
+ * ######################################################################
+ * This section of code deals with detecting the objects no longer in use
+ * and reclaiming the memory.
+ */
+
+#if 0
 static mword new_obj_references = 0;
 static mword obj_references_checked = 0;
 
@@ -1184,21 +1380,11 @@ static mword obj_references_checked = 0;
                }       \
        } while (0)
 
-/*
- * ######################################################################
- * ########  Detecting and removing garbage.
- * ######################################################################
- * This section of code deals with detecting the objects no longer in use
- * and reclaiming the memory.
- */
 static void __attribute__((noinline))
 scan_area (char *start, char *end)
 {
        GCVTable *vt;
-       size_t skip_size;
-       int type;
        int type_str = 0, type_rlen = 0, type_bitmap = 0, type_vector = 0, type_lbit = 0, type_complex = 0;
-       mword desc;
        new_obj_references = 0;
        obj_references_checked = 0;
        while (start < end) {
@@ -1212,85 +1398,276 @@ scan_area (char *start, char *end)
                        MonoObject *obj = (MonoObject*)start;
                        g_print ("found at %p (0x%zx): %s.%s\n", start, vt->desc, obj->vtable->klass->name_space, obj->vtable->klass->name);
                }
-               desc = vt->desc;
-               type = desc & 0x7;
-               if (type == DESC_TYPE_STRING) {
-                       STRING_SIZE (skip_size, start);
-                       start += skip_size;
-                       type_str++;
-                       continue;
-               } else if (type == DESC_TYPE_RUN_LENGTH) {
-                       OBJ_RUN_LEN_SIZE (skip_size, desc, start);
-                       g_assert (skip_size);
-                       OBJ_RUN_LEN_FOREACH_PTR (desc,start);
-                       start += skip_size;
-                       type_rlen++;
-                       continue;
-               } else if (type == DESC_TYPE_VECTOR) { // includes ARRAY, too
-                       skip_size = safe_object_get_size ((MonoObject*)start);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       OBJ_VECTOR_FOREACH_PTR (vt, start);
-                       if (type == DESC_TYPE_ARRAY) {
-                               /* account for the bounds */
+
+#define SCAN_OBJECT_ACTION COUNT_OBJECT_TYPES
+#include "sgen-scan-object.h"
+       }
+       /*printf ("references to new nursery %p-%p (size: %dk): %d, checked: %d\n", old_start, end, (end-old_start)/1024, new_obj_references, obj_references_checked);
+       printf ("\tstrings: %d, runl: %d, vector: %d, bitmaps: %d, lbitmaps: %d, complex: %d\n",
+               type_str, type_rlen, type_vector, type_bitmap, type_lbit, type_complex);*/
+}
+#endif
+
+static gboolean
+is_xdomain_ref_allowed (gpointer *ptr, char *obj, MonoDomain *domain)
+{
+       MonoObject *o = (MonoObject*)(obj);
+       MonoObject *ref = (MonoObject*)*(ptr);
+       int offset = (char*)(ptr) - (char*)o;
+
+       if (o->vtable->klass == mono_defaults.thread_class && offset == G_STRUCT_OFFSET (MonoThread, internal_thread))
+               return TRUE;
+       if (o->vtable->klass == mono_defaults.internal_thread_class && offset == G_STRUCT_OFFSET (MonoInternalThread, current_appcontext))
+               return TRUE;
+       if (mono_class_has_parent (o->vtable->klass, mono_defaults.real_proxy_class) &&
+                       offset == G_STRUCT_OFFSET (MonoRealProxy, unwrapped_server))
+               return TRUE;
+       /* Thread.cached_culture_info */
+       if (!strcmp (ref->vtable->klass->name_space, "System.Globalization") &&
+                       !strcmp (ref->vtable->klass->name, "CultureInfo") &&
+                       !strcmp(o->vtable->klass->name_space, "System") &&
+                       !strcmp(o->vtable->klass->name, "Object[]"))
+               return TRUE;
+       /*
+        *  at System.IO.MemoryStream.InternalConstructor (byte[],int,int,bool,bool) [0x0004d] in /home/schani/Work/novell/trunk/mcs/class/corlib/System.IO/MemoryStream.cs:121
+        * at System.IO.MemoryStream..ctor (byte[]) [0x00017] in /home/schani/Work/novell/trunk/mcs/class/corlib/System.IO/MemoryStream.cs:81
+        * at (wrapper remoting-invoke-with-check) System.IO.MemoryStream..ctor (byte[]) <IL 0x00020, 0xffffffff>
+        * at System.Runtime.Remoting.Messaging.CADMethodCallMessage.GetArguments () [0x0000d] in /home/schani/Work/novell/trunk/mcs/class/corlib/System.Runtime.Remoting.Messaging/CADMessages.cs:327
+        * at System.Runtime.Remoting.Messaging.MethodCall..ctor (System.Runtime.Remoting.Messaging.CADMethodCallMessage) [0x00017] in /home/schani/Work/novell/trunk/mcs/class/corlib/System.Runtime.Remoting.Messaging/MethodCall.cs:87
+        * at System.AppDomain.ProcessMessageInDomain (byte[],System.Runtime.Remoting.Messaging.CADMethodCallMessage,byte[]&,System.Runtime.Remoting.Messaging.CADMethodReturnMessage&) [0x00018] in /home/schani/Work/novell/trunk/mcs/class/corlib/System/AppDomain.cs:1213
+        * at (wrapper remoting-invoke-with-check) System.AppDomain.ProcessMessageInDomain (byte[],System.Runtime.Remoting.Messaging.CADMethodCallMessage,byte[]&,System.Runtime.Remoting.Messaging.CADMethodReturnMessage&) <IL 0x0003d, 0xffffffff>
+        * at System.Runtime.Remoting.Channels.CrossAppDomainSink.ProcessMessageInDomain (byte[],System.Runtime.Remoting.Messaging.CADMethodCallMessage) [0x00008] in /home/schani/Work/novell/trunk/mcs/class/corlib/System.Runtime.Remoting.Channels/CrossAppDomainChannel.cs:198
+        * at (wrapper runtime-invoke) object.runtime_invoke_CrossAppDomainSink/ProcessMessageRes_object_object (object,intptr,intptr,intptr) <IL 0x0004c, 0xffffffff>
+        */
+       if (!strcmp (ref->vtable->klass->name_space, "System") &&
+                       !strcmp (ref->vtable->klass->name, "Byte[]") &&
+                       !strcmp (o->vtable->klass->name_space, "System.IO") &&
+                       !strcmp (o->vtable->klass->name, "MemoryStream"))
+               return TRUE;
+       /* append_job() in threadpool.c */
+       if (!strcmp (ref->vtable->klass->name_space, "System.Runtime.Remoting.Messaging") &&
+                       !strcmp (ref->vtable->klass->name, "AsyncResult") &&
+                       !strcmp (o->vtable->klass->name_space, "System") &&
+                       !strcmp (o->vtable->klass->name, "Object[]") &&
+                       mono_thread_pool_is_queue_array ((MonoArray*) o))
+               return TRUE;
+       return FALSE;
+}
+
+static void
+check_reference_for_xdomain (gpointer *ptr, char *obj, MonoDomain *domain)
+{
+       MonoObject *o = (MonoObject*)(obj);
+       MonoObject *ref = (MonoObject*)*(ptr);
+       int offset = (char*)(ptr) - (char*)o;
+       MonoClass *class;
+       MonoClassField *field;
+       char *str;
+
+       if (!ref || ref->vtable->domain == domain)
+               return;
+       if (is_xdomain_ref_allowed (ptr, obj, domain))
+               return;
+
+       field = NULL;
+       for (class = o->vtable->klass; class; class = class->parent) {
+               int i;
+
+               for (i = 0; i < class->field.count; ++i) {
+                       if (class->fields[i].offset == offset) {
+                               field = &class->fields[i];
+                               break;
                        }
-                       start += skip_size;
-                       type_vector++;
-                       continue;
-               } else if (type == DESC_TYPE_SMALL_BITMAP) {
-                       OBJ_BITMAP_SIZE (skip_size, desc, start);
-                       g_assert (skip_size);
-                       OBJ_BITMAP_FOREACH_PTR (desc,start);
-                       start += skip_size;
-                       type_bitmap++;
-                       continue;
-               } else if (type == DESC_TYPE_LARGE_BITMAP) {
-                       skip_size = safe_object_get_size ((MonoObject*)start);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       OBJ_LARGE_BITMAP_FOREACH_PTR (vt,start);
-                       start += skip_size;
-                       type_lbit++;
+               }
+               if (field)
+                       break;
+       }
+
+       if (ref->vtable->klass == mono_defaults.string_class)
+               str = mono_string_to_utf8 ((MonoString*)ref);
+       else
+               str = NULL;
+       g_print ("xdomain reference in %p (%s.%s) at offset %d (%s) to %p (%s.%s) (%s)  -  pointed to by:\n",
+                       o, o->vtable->klass->name_space, o->vtable->klass->name,
+                       offset, field ? field->name : "",
+                       ref, ref->vtable->klass->name_space, ref->vtable->klass->name, str ? str : "");
+       mono_gc_scan_for_specific_ref (o);
+       if (str)
+               g_free (str);
+}
+
+#undef HANDLE_PTR
+#define HANDLE_PTR(ptr,obj)    check_reference_for_xdomain ((ptr), (obj), domain)
+
+static char*
+scan_object_for_xdomain_refs (char *start)
+{
+       MonoDomain *domain = ((MonoObject*)start)->vtable->domain;
+
+       #include "sgen-scan-object.h"
+
+       return start;
+}
+
+static void
+scan_area_for_xdomain_refs (char *start, char *end)
+{
+       while (start < end) {
+               if (!*(void**)start) {
+                       start += sizeof (void*); /* should be ALLOC_ALIGN, really */
                        continue;
-               } else if (type == DESC_TYPE_COMPLEX) {
-                       /* this is a complex object */
-                       skip_size = safe_object_get_size ((MonoObject*)start);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       OBJ_COMPLEX_FOREACH_PTR (vt, start);
-                       start += skip_size;
-                       type_complex++;
+               }
+
+               start = scan_object_for_xdomain_refs (start);
+       }
+}
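
Several of the new helpers here follow the same template pattern: redefine HANDLE_PTR for the per-reference action you want, optionally define SCAN_OBJECT_NOSCAN or SCAN_OBJECT_ACTION, then include sgen-scan-object.h, which expands into the per-descriptor-type walk over the object at `start` and advances `start` past it. A hedged, self-contained sketch of the shape of such a scanner (the printout is illustrative, not part of the patch):

#undef HANDLE_PTR
#define HANDLE_PTR(ptr,obj)    do {                                     \
                if (*(ptr))                                             \
                        g_print ("ref %p at %p in %p\n", *(ptr), (ptr), (obj)); \
        } while (0)

static char*
example_scan_object_print_refs (char *start)
{
        #include "sgen-scan-object.h"
        return start;
}
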
+
+#undef HANDLE_PTR
+#define HANDLE_PTR(ptr,obj) do {               \
+       if ((MonoObject*)*(ptr) == key) {       \
+       g_print ("found ref to %p in object %p (%s) at offset %zd\n",   \
+                       key, (obj), safe_name ((obj)), ((char*)(ptr) - (char*)(obj))); \
+       }                                                               \
+       } while (0)
+
+static char*
+scan_object_for_specific_ref (char *start, MonoObject *key)
+{
+       #include "sgen-scan-object.h"
+
+       return start;
+}
+
+static void
+scan_area_for_specific_ref (char *start, char *end, MonoObject *key)
+{
+       while (start < end) {
+               if (!*(void**)start) {
+                       start += sizeof (void*); /* should be ALLOC_ALIGN, really */
                        continue;
-               } else if (type == DESC_TYPE_COMPLEX_ARR) {
-                       /* this is an array of complex structs */
-                       skip_size = mono_array_element_size (((MonoVTable*)vt)->klass);
-                       skip_size *= mono_array_length ((MonoArray*)start);
-                       skip_size += sizeof (MonoArray);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       OBJ_COMPLEX_ARR_FOREACH_PTR (vt, start);
-                       if (type == DESC_TYPE_ARRAY) {
-                               /* account for the bounds */
+               }
+
+               start = scan_object_for_specific_ref (start, key);
+       }
+}
+
+static void
+scan_pinned_object_for_specific_ref_callback (PinnedChunk *chunk, char *obj, size_t size, MonoObject *key)
+{
+       scan_object_for_specific_ref (obj, key);
+}
+
+static void
+check_root_obj_specific_ref (RootRecord *root, MonoObject *key, MonoObject *obj)
+{
+       if (key != obj)
+               return;
+       g_print ("found ref to %p in root record %p\n", key, root);
+}
+
+static MonoObject *check_key = NULL;
+static RootRecord *check_root = NULL;
+
+static void*
+check_root_obj_specific_ref_from_marker (void *obj)
+{
+       check_root_obj_specific_ref (check_root, check_key, obj);
+       return obj;
+}
+
+static void
+scan_roots_for_specific_ref (MonoObject *key, int root_type)
+{
+       int i;
+       RootRecord *root;
+       check_key = key;
+       for (i = 0; i < roots_hash_size [root_type]; ++i) {
+               for (root = roots_hash [root_type][i]; root; root = root->next) {
+                       void **start_root = (void**)root->start_root;
+                       mword desc = root->root_desc;
+
+                       check_root = root;
+
+                       switch (desc & ROOT_DESC_TYPE_MASK) {
+                       case ROOT_DESC_BITMAP:
+                               desc >>= ROOT_DESC_TYPE_SHIFT;
+                               while (desc) {
+                                       if (desc & 1)
+                                               check_root_obj_specific_ref (root, key, *start_root);
+                                       desc >>= 1;
+                                       start_root++;
+                               }
+                               return;
+                       case ROOT_DESC_COMPLEX: {
+                               gsize *bitmap_data = complex_descriptors + (desc >> ROOT_DESC_TYPE_SHIFT);
+                               int bwords = (*bitmap_data) - 1;
+                               void **start_run = start_root;
+                               bitmap_data++;
+                               while (bwords-- > 0) {
+                                       gsize bmap = *bitmap_data++;
+                                       void **objptr = start_run;
+                                       while (bmap) {
+                                               if (bmap & 1)
+                                                       check_root_obj_specific_ref (root, key, *objptr);
+                                               bmap >>= 1;
+                                               ++objptr;
+                                       }
+                                       start_run += GC_BITS_PER_WORD;
+                               }
+                               break;
+                       }
+                       case ROOT_DESC_USER: {
+                               MonoGCMarkFunc marker = user_descriptors [desc >> ROOT_DESC_TYPE_SHIFT];
+                               marker (start_root, check_root_obj_specific_ref_from_marker);
+                               break;
+                       }
+                       case ROOT_DESC_RUN_LEN:
+                               g_assert_not_reached ();
+                       default:
+                               g_assert_not_reached ();
+                       }
+               }
+       }
+       check_key = NULL;
+       check_root = NULL;
+}
+
+void
+mono_gc_scan_for_specific_ref (MonoObject *key)
+{
+       GCMemSection *section;
+       LOSObject *bigobj;
+       RootRecord *root;
+       int i;
+
+       for (section = section_list; section; section = section->block.next)
+               scan_area_for_specific_ref (section->data, section->end_data, key);
+
+       for (bigobj = los_object_list; bigobj; bigobj = bigobj->next)
+               scan_object_for_specific_ref (bigobj->data, key);
+
+       scan_pinned_objects ((ScanPinnedObjectCallbackFunc)scan_pinned_object_for_specific_ref_callback, key);
+
+       scan_roots_for_specific_ref (key, ROOT_TYPE_NORMAL);
+       scan_roots_for_specific_ref (key, ROOT_TYPE_WBARRIER);
+
+       for (i = 0; i < roots_hash_size [ROOT_TYPE_PINNED]; ++i) {
+               for (root = roots_hash [ROOT_TYPE_PINNED][i]; root; root = root->next) {
+                       void **ptr = (void**)root->start_root;
+
+                       while (ptr < (void**)root->end_root) {
+                               check_root_obj_specific_ref (root, *ptr, key);
+                               ++ptr;
                        }
-                       start += skip_size;
-                       type_complex++;
-                       continue;
-               } else {
-                       g_assert (0);
                }
        }
-       /*printf ("references to new nursery %p-%p (size: %dk): %d, checked: %d\n", old_start, end, (end-old_start)/1024, new_obj_references, obj_references_checked);
-       printf ("\tstrings: %d, runl: %d, vector: %d, bitmaps: %d, lbitmaps: %d, complex: %d\n",
-               type_str, type_rlen, type_vector, type_bitmap, type_lbit, type_complex);*/
 }
 
 static gboolean
 need_remove_object_for_domain (char *start, MonoDomain *domain)
 {
-       GCVTable *vt = (GCVTable*)LOAD_VTABLE (start);
-       /* handle threads someway (maybe insert the root domain vtable?) */
-       if (mono_object_domain (start) == domain && vt->klass != mono_defaults.thread_class) {
-               DEBUG (1, fprintf (gc_debug_file, "Need to cleanup object %p, (%s)\n", start, safe_name (start)));
+       if (mono_object_domain (start) == domain) {
+               DEBUG (1, fprintf (gc_debug_file, "Need to cleanup object %p\n", start));
                return TRUE;
        }
        return FALSE;
@@ -1300,6 +1677,8 @@ static void
 process_object_for_domain_clearing (char *start, MonoDomain *domain)
 {
        GCVTable *vt = (GCVTable*)LOAD_VTABLE (start);
+       if (vt->klass == mono_defaults.internal_thread_class)
+               g_assert (mono_object_domain (start) == mono_get_root_domain ());
        /* The object could be a proxy for an object in the domain
           we're deleting. */
        if (mono_class_has_parent (vt->klass, mono_defaults.real_proxy_class)) {
@@ -1308,8 +1687,8 @@ process_object_for_domain_clearing (char *start, MonoDomain *domain)
                /* The server could already have been zeroed out, so
                   we need to check for that, too. */
                if (server && (!LOAD_VTABLE (server) || mono_object_domain (server) == domain)) {
-                       DEBUG (1, fprintf (gc_debug_file, "Cleaning up remote pointer in %p to object %p (%s)\n",
-                                       start, server, LOAD_VTABLE (server) ? safe_name (server) : "null"));
+                       DEBUG (1, fprintf (gc_debug_file, "Cleaning up remote pointer in %p to object %p\n",
+                                       start, server));
                        ((MonoRealProxy*)start)->unwrapped_server = NULL;
                }
        }
@@ -1319,10 +1698,7 @@ static void __attribute__((noinline))
 scan_area_for_domain (MonoDomain *domain, char *start, char *end)
 {
        GCVTable *vt;
-       size_t skip_size;
-       int type;
        gboolean remove;
-       mword desc;
 
        while (start < end) {
                if (!*(void**)start) {
@@ -1332,81 +1708,121 @@ scan_area_for_domain (MonoDomain *domain, char *start, char *end)
                vt = (GCVTable*)LOAD_VTABLE (start);
                process_object_for_domain_clearing (start, domain);
                remove = need_remove_object_for_domain (start, domain);
-               desc = vt->desc;
-               type = desc & 0x7;
-               if (type == DESC_TYPE_STRING) {
-                       STRING_SIZE (skip_size, start);
-                       if (remove) memset (start, 0, skip_size);
-                       start += skip_size;
-                       continue;
-               } else if (type == DESC_TYPE_RUN_LENGTH) {
-                       OBJ_RUN_LEN_SIZE (skip_size, desc, start);
-                       g_assert (skip_size);
-                       if (remove) memset (start, 0, skip_size);
-                       start += skip_size;
-                       continue;
-               } else if (type == DESC_TYPE_VECTOR) { // includes ARRAY, too
-                       skip_size = safe_object_get_size ((MonoObject*)start);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       if (type == DESC_TYPE_ARRAY) {
-                               /* account for the bounds */
-                       }
-                       if (remove) memset (start, 0, skip_size);
-                       start += skip_size;
-                       continue;
-               } else if (type == DESC_TYPE_SMALL_BITMAP) {
-                       OBJ_BITMAP_SIZE (skip_size, desc, start);
-                       g_assert (skip_size);
-                       if (remove) memset (start, 0, skip_size);
-                       start += skip_size;
-                       continue;
-               } else if (type == DESC_TYPE_LARGE_BITMAP) {
-                       skip_size = safe_object_get_size ((MonoObject*)start);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       if (remove) memset (start, 0, skip_size);
-                       start += skip_size;
-                       continue;
-               } else if (type == DESC_TYPE_COMPLEX) {
-                       /* this is a complex object */
-                       skip_size = safe_object_get_size ((MonoObject*)start);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       if (remove) memset (start, 0, skip_size);
-                       start += skip_size;
-                       continue;
-               } else if (type == DESC_TYPE_COMPLEX_ARR) {
-                       /* this is an array of complex structs */
-                       skip_size = mono_array_element_size (((MonoVTable*)vt)->klass);
-                       skip_size *= mono_array_length ((MonoArray*)start);
-                       skip_size += sizeof (MonoArray);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       if (type == DESC_TYPE_ARRAY) {
-                               /* account for the bounds */
-                       }
-                       if (remove) memset (start, 0, skip_size);
-                       start += skip_size;
-                       continue;
-               } else {
-                       g_assert (0);
+               if (remove && ((MonoObject*)start)->synchronisation) {
+                       void **dislink = mono_monitor_get_object_monitor_weak_link ((MonoObject*)start);
+                       if (dislink)
+                               mono_gc_register_disappearing_link (NULL, dislink, FALSE);
                }
+
+#define SCAN_OBJECT_NOSCAN
+#define SCAN_OBJECT_ACTION do {                                                \
+                       if (remove) memset (start, 0, skip_size);       \
+               } while (0)
+#include "sgen-scan-object.h"
        }
 }
 
-static void
-clear_domain_process_pinned_object_callback (PinnedChunk *chunk, char *obj, size_t size, MonoDomain *domain)
+static MonoDomain *check_domain = NULL;
+
+static void*
+check_obj_not_in_domain (void *o)
 {
-       process_object_for_domain_clearing (obj, domain);
+       g_assert (((MonoObject*)o)->vtable->domain != check_domain);
+       return o;
 }
 
 static void
-clear_domain_free_pinned_object_callback (PinnedChunk *chunk, char *obj, size_t size, MonoDomain *domain)
+scan_for_registered_roots_in_domain (MonoDomain *domain, int root_type)
 {
-       if (need_remove_object_for_domain (obj, domain))
-               free_pinned_object (chunk, obj, size);
-}
+       int i;
+       RootRecord *root;
+       check_domain = domain;
+       for (i = 0; i < roots_hash_size [root_type]; ++i) {
+               for (root = roots_hash [root_type][i]; root; root = root->next) {
+                       void **start_root = (void**)root->start_root;
+                       mword desc = root->root_desc;
+
+                       /* The MonoDomain struct is allowed to hold
+                          references to objects in its own domain. */
+                       if (start_root == (void**)domain)
+                               continue;
+
+                       switch (desc & ROOT_DESC_TYPE_MASK) {
+                       case ROOT_DESC_BITMAP:
+                               desc >>= ROOT_DESC_TYPE_SHIFT;
+                               while (desc) {
+                                       if ((desc & 1) && *start_root)
+                                               check_obj_not_in_domain (*start_root);
+                                       desc >>= 1;
+                                       start_root++;
+                               }
+                               break;
+                       case ROOT_DESC_COMPLEX: {
+                               gsize *bitmap_data = complex_descriptors + (desc >> ROOT_DESC_TYPE_SHIFT);
+                               int bwords = (*bitmap_data) - 1;
+                               void **start_run = start_root;
+                               bitmap_data++;
+                               while (bwords-- > 0) {
+                                       gsize bmap = *bitmap_data++;
+                                       void **objptr = start_run;
+                                       while (bmap) {
+                                               if ((bmap & 1) && *objptr)
+                                                       check_obj_not_in_domain (*objptr);
+                                               bmap >>= 1;
+                                               ++objptr;
+                                       }
+                                       start_run += GC_BITS_PER_WORD;
+                               }
+                               break;
+                       }
+                       case ROOT_DESC_USER: {
+                               MonoGCMarkFunc marker = user_descriptors [desc >> ROOT_DESC_TYPE_SHIFT];
+                               marker (start_root, check_obj_not_in_domain);
+                               break;
+                       }
+                       case ROOT_DESC_RUN_LEN:
+                               g_assert_not_reached ();
+                       default:
+                               g_assert_not_reached ();
+                       }
+               }
+       }
+       check_domain = NULL;
+}
+
+static void
+clear_domain_process_pinned_object_callback (PinnedChunk *chunk, char *obj, size_t size, MonoDomain *domain)
+{
+       process_object_for_domain_clearing (obj, domain);
+}
+
+static void
+clear_domain_free_pinned_object_callback (PinnedChunk *chunk, char *obj, size_t size, MonoDomain *domain)
+{
+       if (need_remove_object_for_domain (obj, domain))
+               free_pinned_object (chunk, obj, size);
+}
+
+static void
+scan_pinned_object_for_xdomain_refs_callback (PinnedChunk *chunk, char *obj, size_t size, gpointer dummy)
+{
+       scan_object_for_xdomain_refs (obj);
+}
+
+static void
+check_for_xdomain_refs (void)
+{
+       GCMemSection *section;
+       LOSObject *bigobj;
+
+       for (section = section_list; section; section = section->block.next)
+               scan_area_for_xdomain_refs (section->data, section->end_data);
+
+       for (bigobj = los_object_list; bigobj; bigobj = bigobj->next)
+               scan_object_for_xdomain_refs (bigobj->data);
+
+       scan_pinned_objects (scan_pinned_object_for_xdomain_refs_callback, NULL);
+}
 
 /*
  * When appdomains are unloaded we can easily remove objects that have finalizers,
@@ -1423,6 +1839,7 @@ mono_gc_clear_domain (MonoDomain * domain)
        GCMemSection *section;
        LOSObject *bigobj, *prev;
        Fragment *frag;
+       int i;
 
        LOCK_GC;
        /* Clear all remaining nursery fragments */
@@ -1434,9 +1851,13 @@ mono_gc_clear_domain (MonoDomain * domain)
                }
        }
 
-       null_links_for_domain (domain);
+       if (xdomain_checks && domain != mono_get_root_domain ()) {
+               scan_for_registered_roots_in_domain (domain, ROOT_TYPE_NORMAL);
+               scan_for_registered_roots_in_domain (domain, ROOT_TYPE_WBARRIER);
+               check_for_xdomain_refs ();
+       }
 
-       for (section = section_list; section; section = section->next) {
+       for (section = section_list; section; section = section->block.next) {
                scan_area_for_domain (domain, section->data, section->end_data);
        }
 
@@ -1446,7 +1867,7 @@ mono_gc_clear_domain (MonoDomain * domain)
           (pinned objects), but we might need to dereference a
           pointer from an object to another object if the first
           object is a proxy. */
-       scan_pinned_objects (clear_domain_process_pinned_object_callback, domain);
+       scan_pinned_objects ((ScanPinnedObjectCallbackFunc)clear_domain_process_pinned_object_callback, domain);
        for (bigobj = los_object_list; bigobj; bigobj = bigobj->next)
                process_object_for_domain_clearing (bigobj->data, domain);
 
@@ -1459,15 +1880,18 @@ mono_gc_clear_domain (MonoDomain * domain)
                        else
                                los_object_list = bigobj->next;
                        bigobj = bigobj->next;
-                       DEBUG (1, fprintf (gc_debug_file, "Freeing large object %p (%s)\n",
-                                       bigobj->data, safe_name (bigobj->data)));
+                       DEBUG (1, fprintf (gc_debug_file, "Freeing large object %p\n",
+                                       bigobj->data));
                        free_large_object (to_free);
                        continue;
                }
                prev = bigobj;
                bigobj = bigobj->next;
        }
-       scan_pinned_objects (clear_domain_free_pinned_object_callback, domain);
+       scan_pinned_objects ((ScanPinnedObjectCallbackFunc)clear_domain_free_pinned_object_callback, domain);
+
+       for (i = GENERATION_NURSERY; i < GENERATION_MAX; ++i)
+               null_links_for_domain (domain, i);
 
        UNLOCK_GC;
 }
@@ -1485,6 +1909,11 @@ add_to_global_remset (gpointer ptr, gboolean root)
 
        DEBUG (8, fprintf (gc_debug_file, "Adding global remset for %p\n", ptr));
 
+       g_assert (!root);
+       g_assert (!ptr_in_nursery (ptr) && ptr_in_nursery (*(gpointer*)ptr));
+
+       HEAVY_STAT (++stat_global_remsets_added);
+
        /* 
         * FIXME: If an object remains pinned, we need to add it at every minor collection.
         * To avoid uncontrolled growth of the global remset, only add each pointer once.
@@ -1518,6 +1947,8 @@ add_to_global_remset (gpointer ptr, gboolean root)
        }
 }
 
+#include "sgen-gray.c"
+
 /*
  * This is how the copying happens from the nursery to the old generation.
  * We assume that at this time all the pinned objects have been identified and
@@ -1543,106 +1974,176 @@ copy_object (char *obj, char *from_space_start, char *from_space_end)
 {
        static void *copy_labels [] = { &&LAB_0, &&LAB_1, &&LAB_2, &&LAB_3, &&LAB_4, &&LAB_5, &&LAB_6, &&LAB_7, &&LAB_8 };
 
-       /* 
-        * FIXME: The second set of checks is only needed if we are called for tospace
-        * objects too.
+       char *forwarded;
+       mword objsize;
+       MonoVTable *vt;
+
+       HEAVY_STAT (++num_copy_object_called);
+
+       if (!(obj >= from_space_start && obj < from_space_end)) {
+               DEBUG (9, fprintf (gc_debug_file, "Not copying %p because it's not in from space (%p-%p)\n",
+                                               obj, from_space_start, from_space_end));
+               HEAVY_STAT (++stat_copy_object_failed_from_space);
+               return obj;
+       }
+
+       DEBUG (9, fprintf (gc_debug_file, "Precise copy of %p", obj));
+
+       /*
+        * obj must belong to one of:
+        *
+        * 1. the nursery
+        * 2. the LOS
+        * 3. a pinned chunk
+        * 4. a non-to-space section of the major heap
+        * 5. a to-space section of the major heap
+        *
+        * In addition, objects in 1, 2 and 4 might also be pinned.
+        * Objects in 1 and 4 might be forwarded.
+        *
+        * Before we can copy the object we must make sure that we are
+        * allowed to, i.e. that the object is not pinned, not already
+        * forwarded and doesn't belong to the LOS, a pinned chunk, or
+        * a to-space section.
+        *
+        * We are usually called for to-space objects (5) when we have
+        * two remset entries for the same reference.  The first entry
+        * copies the object and updates the reference and the second
+        * calls us with the updated reference that points into
+        * to-space.  There might also be other circumstances where we
+        * get to-space objects.
         */
-       if (obj >= from_space_start && obj < from_space_end && (obj < to_space || obj >= to_space_end)) {
-               MonoVTable *vt;
-               char *forwarded;
-               mword objsize;
-               DEBUG (9, fprintf (gc_debug_file, "Precise copy of %p", obj));
-               if ((forwarded = object_is_forwarded (obj))) {
-                       g_assert (((MonoVTable*)LOAD_VTABLE(obj))->gc_descr);
-                       DEBUG (9, fprintf (gc_debug_file, " (already forwarded to %p)\n", forwarded));
-                       return forwarded;
-               }
-               if (object_is_pinned (obj)) {
-                       g_assert (((MonoVTable*)LOAD_VTABLE(obj))->gc_descr);
-                       DEBUG (9, fprintf (gc_debug_file, " (pinned, no change)\n"));
-                       return obj;
-               }
-               objsize = safe_object_get_size ((MonoObject*)obj);
-               objsize += ALLOC_ALIGN - 1;
-               objsize &= ~(ALLOC_ALIGN - 1);
-               DEBUG (9, fprintf (gc_debug_file, " (to %p, %s size: %zd)\n", gray_objects, ((MonoObject*)obj)->vtable->klass->name, objsize));
-               /* FIXME: handle pinned allocs:
-                * Large objects are simple, at least until we always follow the rule:
-                * if objsize >= MAX_SMALL_OBJ_SIZE, pin the object and return it.
-                * At the end of major collections, we walk the los list and if
-                * the object is pinned, it is marked, otherwise it can be freed.
-                */
-               if (G_UNLIKELY (objsize >= MAX_SMALL_OBJ_SIZE || (obj >= min_pinned_chunk_addr && obj < max_pinned_chunk_addr && obj_is_from_pinned_alloc (obj)))) {
-                       DEBUG (9, fprintf (gc_debug_file, "Marked LOS/Pinned %p (%s), size: %zd\n", obj, safe_name (obj), objsize));
-                       pin_object (obj);
-                       return obj;
-               }
-               /* ok, the object is not pinned, we can move it */
-               /* use a optimized memcpy here */
-               if (objsize <= sizeof (gpointer) * 8) {
-                       mword *dest = (mword*)gray_objects;
-                       goto *copy_labels [objsize / sizeof (gpointer)];
-               LAB_8:
-                       (dest) [7] = ((mword*)obj) [7];
-               LAB_7:
-                       (dest) [6] = ((mword*)obj) [6];
-               LAB_6:
-                       (dest) [5] = ((mword*)obj) [5];
-               LAB_5:
-                       (dest) [4] = ((mword*)obj) [4];
-               LAB_4:
-                       (dest) [3] = ((mword*)obj) [3];
-               LAB_3:
-                       (dest) [2] = ((mword*)obj) [2];
-               LAB_2:
-                       (dest) [1] = ((mword*)obj) [1];
-               LAB_1:
-                       (dest) [0] = ((mword*)obj) [0];
-               LAB_0:
-                       ;
-               } else {
+
+       if ((forwarded = object_is_forwarded (obj))) {
+               g_assert (((MonoVTable*)LOAD_VTABLE(obj))->gc_descr);
+               DEBUG (9, fprintf (gc_debug_file, " (already forwarded to %p)\n", forwarded));
+               HEAVY_STAT (++stat_copy_object_failed_forwarded);
+               return forwarded;
+       }
+       if (object_is_pinned (obj)) {
+               g_assert (((MonoVTable*)LOAD_VTABLE(obj))->gc_descr);
+               DEBUG (9, fprintf (gc_debug_file, " (pinned, no change)\n"));
+               HEAVY_STAT (++stat_copy_object_failed_pinned);
+               return obj;
+       }
+
+       objsize = safe_object_get_size ((MonoObject*)obj);
+       objsize += ALLOC_ALIGN - 1;
+       objsize &= ~(ALLOC_ALIGN - 1);
+
+       if (ptr_in_nursery (obj))
+               goto copy;
+
+       /*
+        * At this point we know obj is not pinned, not forwarded and
+        * belongs to one of the cases 2, 3, 4, or 5.
+        *
+        * LOS objects (2) are simple, at least until we always follow
+        * the rule: if objsize > MAX_SMALL_OBJ_SIZE, pin the object
+        * and return it.  At the end of major collections, we walk
+        * the los list and if the object is pinned, it is marked,
+        * otherwise it can be freed.
+        *
+        * Pinned chunks (3) and major heap sections (4, 5) both
+        * reside in blocks, which are always aligned, so once we've
+        * eliminated LOS objects, we can just access the block and
+        * see whether it's a pinned chunk or a major heap section.
+        */
+       if (G_UNLIKELY (objsize > MAX_SMALL_OBJ_SIZE || obj_is_from_pinned_alloc (obj))) {
+               DEBUG (9, fprintf (gc_debug_file, " (marked LOS/Pinned %p (%s), size: %zd)\n", obj, safe_name (obj), objsize));
+               pin_object (obj);
+               HEAVY_STAT (++stat_copy_object_failed_large_pinned);
+               return obj;
+       }
+
+       /*
+        * Now we know the object is in a major heap section.  All we
+        * need to do is check whether it's already in to-space (5) or
+        * not (4).
+        */
+       if (MAJOR_SECTION_FOR_OBJECT (obj)->is_to_space) {
+               g_assert (objsize <= MAX_SMALL_OBJ_SIZE);
+               DEBUG (9, fprintf (gc_debug_file, " (already copied)\n"));
+               HEAVY_STAT (++stat_copy_object_failed_to_space);
+               return obj;
+       }
+
+ copy:
+       DEBUG (9, fprintf (gc_debug_file, " (to %p, %s size: %zd)\n", to_space_bumper, ((MonoObject*)obj)->vtable->klass->name, objsize));
+
+       HEAVY_STAT (++num_objects_copied);
+
+       /* Make sure we have enough space available */
+       if (to_space_bumper + objsize > to_space_top) {
+               to_space_expand ();
+               g_assert (to_space_bumper + objsize <= to_space_top);
+       }
+
+       if (objsize <= sizeof (gpointer) * 8) {
+               mword *dest = (mword*)to_space_bumper;
+               goto *copy_labels [objsize / sizeof (gpointer)];
+       LAB_8:
+               (dest) [7] = ((mword*)obj) [7];
+       LAB_7:
+               (dest) [6] = ((mword*)obj) [6];
+       LAB_6:
+               (dest) [5] = ((mword*)obj) [5];
+       LAB_5:
+               (dest) [4] = ((mword*)obj) [4];
+       LAB_4:
+               (dest) [3] = ((mword*)obj) [3];
+       LAB_3:
+               (dest) [2] = ((mword*)obj) [2];
+       LAB_2:
+               (dest) [1] = ((mword*)obj) [1];
+       LAB_1:
+               (dest) [0] = ((mword*)obj) [0];
+       LAB_0:
+               ;
+       } else {
 #if 0
                {
                        int ecx;
                        char* esi = obj;
-                       char* edi = gray_objects;
+                       char* edi = to_space_bumper;
                        __asm__ __volatile__(
                                "rep; movsl"
                                : "=&c" (ecx), "=&D" (edi), "=&S" (esi)
                                : "0" (objsize/4), "1" (edi),"2" (esi)
                                : "memory"
-                       );
+                                            );
                }
 #else
-               memcpy (gray_objects, obj, objsize);
+               memcpy (to_space_bumper, obj, objsize);
 #endif
-               }
-               /* adjust array->bounds */
-               vt = ((MonoObject*)obj)->vtable;
-               g_assert (vt->gc_descr);
-               if (G_UNLIKELY (vt->rank && ((MonoArray*)obj)->bounds)) {
-                       MonoArray *array = (MonoArray*)gray_objects;
-                       array->bounds = (MonoArrayBounds*)((char*)gray_objects + ((char*)((MonoArray*)obj)->bounds - (char*)obj));
-                       DEBUG (9, fprintf (gc_debug_file, "Array instance %p: size: %zd, rank: %d, length: %d\n", array, objsize, vt->rank, mono_array_length (array)));
-               }
-               /* set the forwarding pointer */
-               forward_object (obj, gray_objects);
-               obj = gray_objects;
-               to_space_section->scan_starts [((char*)obj - (char*)to_space_section->data)/SCAN_START_SIZE] = obj;
-               gray_objects += objsize;
-               DEBUG (8, g_assert (gray_objects <= to_space_end));
-               return obj;
        }
+       /* adjust array->bounds */
+       vt = ((MonoObject*)obj)->vtable;
+       g_assert (vt->gc_descr);
+       if (G_UNLIKELY (vt->rank && ((MonoArray*)obj)->bounds)) {
+               MonoArray *array = (MonoArray*)to_space_bumper;
+               array->bounds = (MonoArrayBounds*)((char*)to_space_bumper + ((char*)((MonoArray*)obj)->bounds - (char*)obj));
+               DEBUG (9, fprintf (gc_debug_file, "Array instance %p: size: %zd, rank: %d, length: %d\n", array, objsize, vt->rank, mono_array_length (array)));
+       }
+       /* set the forwarding pointer */
+       forward_object (obj, to_space_bumper);
+       obj = to_space_bumper;
+       to_space_section->scan_starts [((char*)obj - (char*)to_space_section->data)/SCAN_START_SIZE] = obj;
+       to_space_bumper += objsize;
+       DEBUG (9, fprintf (gc_debug_file, "Enqueuing gray object %p (%s)\n", obj, safe_name (obj)));
+       gray_object_enqueue (obj);
+       DEBUG (8, g_assert (to_space_bumper <= to_space_top));
        return obj;
 }
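
/*
 * Editorial sketch, not part of this change: the decision order that
 * copy_object () above implements, written as a tiny stand-alone
 * classifier.  The boolean parameters are hypothetical stand-ins for
 * object_is_forwarded (), object_is_pinned (), ptr_in_nursery (), the
 * LOS/pinned-chunk test and the is_to_space check used in the real code.
 */
typedef enum {
	SKETCH_COPY_FOLLOW_FORWARDING,		/* already copied: return the forwarding pointer */
	SKETCH_COPY_KEEP_PINNED,		/* pinned: the object stays where it is */
	SKETCH_COPY_PIN_LARGE,			/* LOS or pinned-chunk object: pin it in place */
	SKETCH_COPY_ALREADY_IN_TO_SPACE,	/* copied in an earlier step: nothing to do */
	SKETCH_COPY_BUMP_COPY			/* nursery/from-space object: copy it to to_space_bumper */
} SketchCopyAction;

static SketchCopyAction
sketch_classify_for_copy (int forwarded, int pinned, int in_nursery,
		int large_or_pinned_chunk, int in_to_space_section)
{
	if (forwarded)
		return SKETCH_COPY_FOLLOW_FORWARDING;
	if (pinned)
		return SKETCH_COPY_KEEP_PINNED;
	if (in_nursery)
		return SKETCH_COPY_BUMP_COPY;
	if (large_or_pinned_chunk)
		return SKETCH_COPY_PIN_LARGE;
	if (in_to_space_section)
		return SKETCH_COPY_ALREADY_IN_TO_SPACE;
	return SKETCH_COPY_BUMP_COPY;
}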
 
 #undef HANDLE_PTR
 #define HANDLE_PTR(ptr,obj)    do {    \
                void *__old = *(ptr);   \
+               void *__copy;           \
                if (__old) {    \
-                       *(ptr) = copy_object (__old, from_start, from_end);     \
-                       DEBUG (9, if (__old != *(ptr)) fprintf (gc_debug_file, "Overwrote field at %p with %p (was: %p)\n", (ptr), *(ptr), __old));     \
-                       if (G_UNLIKELY (*(ptr) >= (void*)from_start && *(ptr) < (void*)from_end) && !ptr_in_nursery (ptr)) \
+                       *(ptr) = __copy = copy_object (__old, from_start, from_end);    \
+                       DEBUG (9, if (__old != __copy) fprintf (gc_debug_file, "Overwrote field at %p with %p (was: %p)\n", (ptr), *(ptr), __old));     \
+                       if (G_UNLIKELY (ptr_in_nursery (__copy) && !ptr_in_nursery ((ptr)))) \
                                add_to_global_remset ((ptr), FALSE);                                                    \
                }       \
        } while (0)
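
/*
 * Editorial sketch, not part of this change: the condition HANDLE_PTR above
 * uses to decide whether a slot must be added to the global remembered set.
 * Only a slot outside the nursery that now points at a nursery object (an
 * old->young reference) has to be remembered for the next minor collection.
 * The nursery bounds are passed explicitly here instead of reading the
 * nursery_start/nursery_real_end globals.
 */
static int
sketch_slot_needs_global_remset (void *slot, void *target, char *nur_start, char *nur_end)
{
	int target_in_nursery = (char*)target >= nur_start && (char*)target < nur_end;
	int slot_in_nursery = (char*)slot >= nur_start && (char*)slot < nur_end;
	return target_in_nursery && !slot_in_nursery;
}
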
@@ -1656,75 +2157,9 @@ copy_object (char *obj, char *from_space_start, char *from_space_end)
 static char*
 scan_object (char *start, char* from_start, char* from_end)
 {
-       GCVTable *vt;
-       size_t skip_size;
-       mword desc;
-
-       vt = (GCVTable*)LOAD_VTABLE (start);
-       //type = vt->desc & 0x7;
+#include "sgen-scan-object.h"
 
-       /* gcc should be smart enough to remove the bounds check, but it isn't:( */
-       desc = vt->desc;
-       switch (desc & 0x7) {
-       //if (type == DESC_TYPE_STRING) {
-       case DESC_TYPE_STRING:
-               STRING_SIZE (skip_size, start);
-               return start + skip_size;
-       //} else if (type == DESC_TYPE_RUN_LENGTH) {
-       case DESC_TYPE_RUN_LENGTH:
-               OBJ_RUN_LEN_FOREACH_PTR (desc,start);
-               OBJ_RUN_LEN_SIZE (skip_size, desc, start);
-               g_assert (skip_size);
-               return start + skip_size;
-       //} else if (type == DESC_TYPE_VECTOR) { // includes ARRAY, too
-       case DESC_TYPE_ARRAY:
-       case DESC_TYPE_VECTOR:
-               OBJ_VECTOR_FOREACH_PTR (vt, start);
-               skip_size = safe_object_get_size ((MonoObject*)start);
-#if 0
-               skip_size = (vt->desc >> LOW_TYPE_BITS) & MAX_ELEMENT_SIZE;
-               skip_size *= mono_array_length ((MonoArray*)start);
-               skip_size += sizeof (MonoArray);
-#endif
-               skip_size += (ALLOC_ALIGN - 1);
-               skip_size &= ~(ALLOC_ALIGN - 1);
-               return start + skip_size;
-       //} else if (type == DESC_TYPE_SMALL_BITMAP) {
-       case DESC_TYPE_SMALL_BITMAP:
-               OBJ_BITMAP_FOREACH_PTR (desc,start);
-               OBJ_BITMAP_SIZE (skip_size, desc, start);
-               return start + skip_size;
-       //} else if (type == DESC_TYPE_LARGE_BITMAP) {
-       case DESC_TYPE_LARGE_BITMAP:
-               OBJ_LARGE_BITMAP_FOREACH_PTR (vt,start);
-               skip_size = safe_object_get_size ((MonoObject*)start);
-               skip_size += (ALLOC_ALIGN - 1);
-               skip_size &= ~(ALLOC_ALIGN - 1);
-               return start + skip_size;
-       //} else if (type == DESC_TYPE_COMPLEX) {
-       case DESC_TYPE_COMPLEX:
-               OBJ_COMPLEX_FOREACH_PTR (vt, start);
-               /* this is a complex object */
-               skip_size = safe_object_get_size ((MonoObject*)start);
-               skip_size += (ALLOC_ALIGN - 1);
-               skip_size &= ~(ALLOC_ALIGN - 1);
-               return start + skip_size;
-       //} else if (type == DESC_TYPE_COMPLEX_ARR) {
-       case DESC_TYPE_COMPLEX_ARR:
-               OBJ_COMPLEX_ARR_FOREACH_PTR (vt, start);
-               /* this is an array of complex structs */
-               skip_size = safe_object_get_size ((MonoObject*)start);
-#if 0
-               skip_size = mono_array_element_size (((MonoObject*)start)->vtable->klass);
-               skip_size *= mono_array_length ((MonoArray*)start);
-               skip_size += sizeof (MonoArray);
-#endif
-               skip_size += (ALLOC_ALIGN - 1);
-               skip_size &= ~(ALLOC_ALIGN - 1);
-               return start + skip_size;
-       }
-       g_assert_not_reached ();
-       return NULL;
+       return start;
 }
 
 /*
@@ -1737,14 +2172,12 @@ scan_object (char *start, char* from_start, char* from_end)
 static void inline
 drain_gray_stack (char *start_addr, char *end_addr)
 {
-       char *gray_start = gray_first;
+       char *obj;
 
-       while (gray_start < gray_objects) {
-               DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", gray_start, safe_name (gray_start)));
-               gray_start = scan_object (gray_start, start_addr, end_addr);
+       while ((obj = gray_object_dequeue ())) {
+               DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", obj, safe_name (obj)));
+               scan_object (obj, start_addr, end_addr);
        }
-
-       gray_first = gray_start;
 }
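
/*
 * Editorial sketch, not part of this change: a minimal stand-in for the
 * gray queue contract that drain_gray_stack () relies on - copy_object ()
 * enqueues every freshly copied object, the drain loop dequeues until the
 * queue is empty.  The real gray_object_enqueue ()/gray_object_dequeue ()
 * grow their storage dynamically; this fixed-size LIFO only illustrates
 * the interface.
 */
#define SKETCH_GRAY_QUEUE_SIZE	1024

static char *sketch_gray_queue [SKETCH_GRAY_QUEUE_SIZE];
static int sketch_gray_queue_depth = 0;

static void
sketch_gray_enqueue (char *obj)
{
	/* the real queue allocates more space instead of asserting */
	g_assert (sketch_gray_queue_depth < SKETCH_GRAY_QUEUE_SIZE);
	sketch_gray_queue [sketch_gray_queue_depth++] = obj;
}

static char*
sketch_gray_dequeue (void)
{
	if (!sketch_gray_queue_depth)
		return NULL;
	return sketch_gray_queue [--sketch_gray_queue_depth];
}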
 
 /*
@@ -1785,10 +2218,14 @@ scan_vtype (char *start, mword desc, char* from_start, char* from_end)
        return NULL;
 }
 
+#include "sgen-pinning-stats.c"
+
 /*
- * Addresses from start to end are already sorted. This function finds the object header
- * for each address and pins the object. The addresses must be inside the passed section.
- * Return the number of pinned objects.
+ * Addresses from start to end are already sorted. This function finds
+ * the object header for each address and pins the object. The
+ * addresses must be inside the passed section.  The (start of the)
+ * address array is overwritten with the addresses of the actually
+ * pinned objects.  Return the number of pinned objects.
  */
 static int
 pin_objects_from_addresses (GCMemSection *section, void **start, void **end, void *start_nursery, void *end_nursery)
@@ -1845,6 +2282,8 @@ pin_objects_from_addresses (GCMemSection *section, void **start, void **end, voi
                                if (addr >= search_start && (char*)addr < (char*)last_obj + last_obj_size) {
                                        DEBUG (4, fprintf (gc_debug_file, "Pinned object %p, vtable %p (%s), count %d\n", search_start, *(void**)search_start, safe_name (search_start), count));
                                        pin_object (search_start);
+                                       if (heap_dump_file)
+                                               pin_stats_register_object (search_start, last_obj_size);
                                        definitely_pinned [count] = search_start;
                                        count++;
                                        break;
@@ -1955,25 +2394,49 @@ optimize_pin_queue (int start_slot)
        
 }
 
+static int
+optimized_pin_queue_search (void *addr)
+{
+       int first = 0, last = next_pin_slot;
+       while (first < last) {
+               int middle = first + ((last - first) >> 1);
+               if (addr <= pin_queue [middle])
+                       last = middle;
+               else
+                       first = middle + 1;
+       }
+       g_assert (first == last);
+       return first;
+}
+
+static void
+find_optimized_pin_queue_area (void *start, void *end, int *first, int *last)
+{
+       *first = optimized_pin_queue_search (start);
+       *last = optimized_pin_queue_search (end);
+}
+
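/*
 * Editorial sketch, not part of this change: the loop above is a plain
 * lower-bound binary search over the sorted pin queue.  Written against an
 * explicit array instead of the pin_queue/next_pin_slot globals it looks
 * like this; find_optimized_pin_queue_area () then returns the half-open
 * index range [lower_bound (start), lower_bound (end)).
 */
static int
sketch_lower_bound (void **sorted, int n, void *addr)
{
	int first = 0, last = n;
	while (first < last) {
		int middle = first + ((last - first) >> 1);
		if (addr <= sorted [middle])
			last = middle;
		else
			first = middle + 1;
	}
	return first;
}

/*
 * Example: for a sorted queue { 0x1000, 0x2000, 0x3000, 0x5000 } and a
 * section covering [0x2000, 0x5000), the area is the index range [1, 3):
 * the candidates 0x2000 and 0x3000 belong to the section, 0x5000 does not.
 */
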
 static void
 realloc_pin_queue (void)
 {
        int new_size = pin_queue_size? pin_queue_size + pin_queue_size/2: 1024;
-       void **new_pin = get_internal_mem (sizeof (void*) * new_size);
+       void **new_pin = get_internal_mem (sizeof (void*) * new_size, INTERNAL_MEM_PIN_QUEUE);
        memcpy (new_pin, pin_queue, sizeof (void*) * next_pin_slot);
-       free_internal_mem (pin_queue);
+       free_internal_mem (pin_queue, INTERNAL_MEM_PIN_QUEUE);
        pin_queue = new_pin;
        pin_queue_size = new_size;
        DEBUG (4, fprintf (gc_debug_file, "Reallocated pin queue to size: %d\n", new_size));
 }
 
+#include "sgen-pinning.c"
+
 /* 
  * Scan the memory between start and end and queue values which could be pointers
  * to the area between start_nursery and end_nursery for later consideration.
  * Typically used for thread stacks.
  */
 static void
-conservatively_pin_objects_from (void **start, void **end, void *start_nursery, void *end_nursery)
+conservatively_pin_objects_from (void **start, void **end, void *start_nursery, void *end_nursery, int pin_type)
 {
        int count = 0;
        while (start < end) {
@@ -1997,24 +2460,16 @@ conservatively_pin_objects_from (void **start, void **end, void *start_nursery,
                         */
                        mword addr = (mword)*start;
                        addr &= ~(ALLOC_ALIGN - 1);
-                       if (next_pin_slot >= pin_queue_size)
-                               realloc_pin_queue ();
-                       pin_queue [next_pin_slot++] = (void*)addr;
+                       if (addr >= (mword)start_nursery && addr < (mword)end_nursery)
+                               pin_stage_ptr ((void*)addr);
+                       if (heap_dump_file)
+                               pin_stats_register_address ((char*)addr, pin_type);
                        DEBUG (6, if (count) fprintf (gc_debug_file, "Pinning address %p\n", (void*)addr));
                        count++;
                }
                start++;
        }
        DEBUG (7, if (count) fprintf (gc_debug_file, "found %d potential pinned heap pointers\n", count));
-
-#ifdef HAVE_VALGRIND_MEMCHECK_H
-       /*
-        * The pinning addresses might come from undefined memory, this is normal. Since they
-        * are used in lots of functions, we make the memory defined here instead of having
-        * to add a supression for those functions.
-        */
-       VALGRIND_MAKE_MEM_DEFINED (pin_queue, next_pin_slot * sizeof (pin_queue [0]));
-#endif
 }
 
 /* 
@@ -2077,7 +2532,7 @@ pin_from_roots (void *start_nursery, void *end_nursery)
        for (i = 0; i < roots_hash_size [ROOT_TYPE_PINNED]; ++i) {
                for (root = roots_hash [ROOT_TYPE_PINNED][i]; root; root = root->next) {
                        DEBUG (6, fprintf (gc_debug_file, "Pinned roots %p-%p\n", root->start_root, root->end_root));
-                       conservatively_pin_objects_from ((void**)root->start_root, (void**)root->end_root, start_nursery, end_nursery);
+                       conservatively_pin_objects_from ((void**)root->start_root, (void**)root->end_root, start_nursery, end_nursery, PIN_TYPE_OTHER);
                }
        }
        /* now deal with the thread stacks
@@ -2088,6 +2543,8 @@ pin_from_roots (void *start_nursery, void *end_nursery)
         * *) pointers slots in managed frames
         */
        scan_thread_data (start_nursery, end_nursery, FALSE);
+
+       evacuate_pin_staging_area ();
 }
 
 /* Copy function called from user defined mark functions */
@@ -2170,11 +2627,32 @@ alloc_fragment (void)
                frag->next = NULL;
                return frag;
        }
-       frag = get_internal_mem (sizeof (Fragment));
+       frag = get_internal_mem (sizeof (Fragment), INTERNAL_MEM_FRAGMENT);
        frag->next = NULL;
        return frag;
 }
 
+/* size must be a power of 2 */
+static void*
+get_os_memory_aligned (mword size, gboolean activate)
+{
+       /* Allocate twice the memory to be able to put the block on an aligned address */
+       char *mem = get_os_memory (size * 2, activate);
+       char *aligned;
+
+       g_assert (mem);
+
+       aligned = (char*)((mword)(mem + (size - 1)) & ~(size - 1));
+       g_assert (aligned >= mem && aligned + size <= mem + size * 2 && !((mword)aligned & (size - 1)));
+
+       if (aligned > mem)
+               free_os_memory (mem, aligned - mem);
+       if (aligned + size < mem + size * 2)
+               free_os_memory (aligned + size, (mem + size * 2) - (aligned + size));
+
+       return aligned;
+}
+
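/*
 * Editorial sketch, not part of this change: the alignment arithmetic used
 * by get_os_memory_aligned () above.  Over-allocating 2 * size guarantees
 * that an aligned block of size bytes fits inside the mapping; the slack
 * before and after it is handed back to the OS.  mword is the word-sized
 * integer type this file already uses.
 */
static char*
sketch_align_up (char *mem, mword size)	/* size must be a power of 2 */
{
	return (char*)(((mword)mem + (size - 1)) & ~(size - 1));
}

/*
 * Example: mem = 0x13000, size = 0x10000 gives aligned = 0x20000; the
 * slack ranges [0x13000, 0x20000) and [0x30000, 0x33000) - exactly size
 * bytes in total - are freed again.
 */
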
 /*
  * Allocate and setup the data structures needed to be able to allocate objects
  * in the nursery. The nursery is stored in nursery_section.
@@ -2196,37 +2674,31 @@ alloc_nursery (void)
         * objects in the existing nursery.
         */
        /* FIXME: handle OOM */
-       section = get_internal_mem (sizeof (GCMemSection));
+       section = get_internal_mem (SIZEOF_GC_MEM_SECTION, INTERNAL_MEM_SECTION);
 
-#ifdef ALIGN_NURSERY
-       /* Allocate twice the memory to be able to put the nursery at an aligned address */
        g_assert (nursery_size == DEFAULT_NURSERY_SIZE);
-
-       alloc_size = nursery_size * 2;
-       data = get_os_memory (alloc_size, TRUE);
-       nursery_start = (void*)(((mword)data + (1 << DEFAULT_NURSERY_BITS) - 1) & ~((1 << DEFAULT_NURSERY_BITS) - 1));
-       g_assert ((char*)nursery_start + nursery_size <= ((char*)data + alloc_size));
-       /* FIXME: Use the remaining size for something else, if it is big enough */
-#else
        alloc_size = nursery_size;
+#ifdef ALIGN_NURSERY
+       data = get_os_memory_aligned (alloc_size, TRUE);
+#else
        data = get_os_memory (alloc_size, TRUE);
-       nursery_start = data;
 #endif
+       nursery_start = data;
        nursery_real_end = nursery_start + nursery_size;
        UPDATE_HEAP_BOUNDARIES (nursery_start, nursery_real_end);
        nursery_next = nursery_start;
        total_alloc += alloc_size;
-       DEBUG (4, fprintf (gc_debug_file, "Expanding heap size: %zd, total: %zd\n", nursery_size, total_alloc));
+       DEBUG (4, fprintf (gc_debug_file, "Expanding nursery size (%p-%p): %zd, total: %zd\n", data, data + alloc_size, nursery_size, total_alloc));
        section->data = section->next_data = data;
        section->size = alloc_size;
        section->end_data = nursery_real_end;
        scan_starts = alloc_size / SCAN_START_SIZE;
-       section->scan_starts = get_internal_mem (sizeof (char*) * scan_starts);
+       section->scan_starts = get_internal_mem (sizeof (char*) * scan_starts, INTERNAL_MEM_SCAN_STARTS);
        section->num_scan_start = scan_starts;
-       section->role = MEMORY_ROLE_GEN0;
+       section->block.role = MEMORY_ROLE_GEN0;
 
        /* add to the section list */
-       section->next = section_list;
+       section->block.next = section_list;
        section_list = section;
 
        nursery_section = section;
@@ -2262,8 +2734,8 @@ scan_old_generation (char *start, char* end)
        GCMemSection *section;
        LOSObject *big_object;
        char *p;
-       
-       for (section = section_list; section; section = section->next) {
+
+       for (section = section_list; section; section = section->block.next) {
                if (section == nursery_section)
                        continue;
                DEBUG (2, fprintf (gc_debug_file, "Scan of old section: %p-%p, size: %d\n", section->data, section->next_data, (int)(section->next_data - section->data)));
@@ -2333,13 +2805,109 @@ scan_needed_big_objects (char *start_addr, char *end_addr)
        return count;
 }
 
+static const char*
+generation_name (int generation)
+{
+       switch (generation) {
+       case GENERATION_NURSERY: return "nursery";
+       case GENERATION_OLD: return "old";
+       default: g_assert_not_reached ();
+       }
+}
+
+static DisappearingLinkHashTable*
+get_dislink_hash_table (int generation)
+{
+       switch (generation) {
+       case GENERATION_NURSERY: return &minor_disappearing_link_hash;
+       case GENERATION_OLD: return &major_disappearing_link_hash;
+       default: g_assert_not_reached ();
+       }
+}
+
+static FinalizeEntryHashTable*
+get_finalize_entry_hash_table (int generation)
+{
+       switch (generation) {
+       case GENERATION_NURSERY: return &minor_finalizable_hash;
+       case GENERATION_OLD: return &major_finalizable_hash;
+       default: g_assert_not_reached ();
+       }
+}
+
+static void
+new_to_space_section (void)
+{
+       /* FIXME: if the current to_space_section is empty, we don't
+          have to allocate a new one */
+
+       to_space_section = alloc_major_section ();
+       to_space_bumper = to_space_section->next_data;
+       to_space_top = to_space_section->end_data;
+}
+
+static void
+to_space_set_next_data (void)
+{
+       g_assert (to_space_bumper >= to_space_section->next_data && to_space_bumper <= to_space_section->end_data);
+       to_space_section->next_data = to_space_bumper;
+}
+
+static void
+to_space_expand (void)
+{
+       if (to_space_section) {
+               g_assert (to_space_top == to_space_section->end_data);
+               to_space_set_next_data ();
+       }
+
+       new_to_space_section ();
+}
+
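/*
 * Editorial sketch, not part of this change: the bump-pointer protocol that
 * the copy path uses against to_space_bumper/to_space_top - reserve objsize
 * bytes at the bumper and expand into a fresh major section when the
 * current one is full.  The globals are passed explicitly here.
 */
static char*
sketch_bump_alloc (char **bumper, char *top, size_t objsize)
{
	char *p;
	if (*bumper + objsize > top)
		return NULL;	/* the caller would call to_space_expand () and retry */
	p = *bumper;
	*bumper += objsize;
	return p;
}
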
+static void
+unset_to_space (void)
+{
+       /* between collections the to_space_bumper is invalidated
+          because degraded allocations might occur, so we set it to
+          NULL, just to make it explicit */
+       to_space_bumper = NULL;
+
+       /* don't unset to_space_section if we implement the FIXME in
+          new_to_space_section */
+       to_space_section = NULL;
+}
+
+static gboolean
+object_is_in_to_space (char *obj)
+{
+       mword objsize;
+
+       /* nursery */
+       if (ptr_in_nursery (obj))
+               return FALSE;
+
+       objsize = safe_object_get_size ((MonoObject*)obj);
+       objsize += ALLOC_ALIGN - 1;
+       objsize &= ~(ALLOC_ALIGN - 1);
+
+       /* LOS */
+       if (objsize > MAX_SMALL_OBJ_SIZE)
+               return FALSE;
+
+       /* pinned chunk */
+       if (obj_is_from_pinned_alloc (obj))
+               return FALSE;
+
+       /* now we know it's in a major heap section */
+       return MAJOR_SECTION_FOR_OBJECT (obj)->is_to_space;
+}
+
 static void
-finish_gray_stack (char *start_addr, char *end_addr)
+finish_gray_stack (char *start_addr, char *end_addr, int generation)
 {
        TV_DECLARE (atv);
        TV_DECLARE (btv);
        int fin_ready, bigo_scanned_num;
-       char *gray_start;
 
        /*
         * We copied all the reachable objects. Now it's the time to copy
@@ -2354,14 +2922,10 @@ finish_gray_stack (char *start_addr, char *end_addr)
         *   To achieve better cache locality and cache usage, we drain the gray stack 
         * frequently, after each object is copied, and just finish the work here.
         */
-       gray_start = gray_first;
-       while (gray_start < gray_objects) {
-               DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", gray_start, safe_name (gray_start)));
-               gray_start = scan_object (gray_start, start_addr, end_addr);
-       }
+       drain_gray_stack (start_addr, end_addr);
        TV_GETTIME (atv);
        //scan_old_generation (start_addr, end_addr);
-       DEBUG (2, fprintf (gc_debug_file, "Old generation done\n"));
+       DEBUG (2, fprintf (gc_debug_file, "%s generation done\n", generation_name (generation)));
        /* walk the finalization queue and move also the objects that need to be
         * finalized: use the finalized objects as new roots so the objects they depend
         * on are also not reclaimed. As with the roots above, only objects in the nursery
@@ -2371,20 +2935,17 @@ finish_gray_stack (char *start_addr, char *end_addr)
         */
        do {
                fin_ready = num_ready_finalizers;
-               finalize_in_range (start_addr, end_addr);
+               finalize_in_range (start_addr, end_addr, generation);
+               if (generation == GENERATION_OLD)
+                       finalize_in_range (nursery_start, nursery_real_end, GENERATION_NURSERY);
                bigo_scanned_num = scan_needed_big_objects (start_addr, end_addr);
 
                /* drain the new stack that might have been created */
-               DEBUG (6, fprintf (gc_debug_file, "Precise scan of gray area post fin: %p-%p, size: %d\n", gray_start, gray_objects, (int)(gray_objects - gray_start)));
-               while (gray_start < gray_objects) {
-                       DEBUG (9, fprintf (gc_debug_file, "Precise gray object scan %p (%s)\n", gray_start, safe_name (gray_start)));
-                       gray_start = scan_object (gray_start, start_addr, end_addr);
-               }
+               DEBUG (6, fprintf (gc_debug_file, "Precise scan of gray area post fin\n"));
+               drain_gray_stack (start_addr, end_addr);
        } while (fin_ready != num_ready_finalizers || bigo_scanned_num);
-
-       DEBUG (2, fprintf (gc_debug_file, "Copied to old space: %d bytes\n", (int)(gray_objects - to_space)));
-       to_space = gray_start;
-       to_space_section->next_data = to_space;
+       TV_GETTIME (btv);
+       DEBUG (2, fprintf (gc_debug_file, "Finalize queue handling scan for %s generation: %d usecs\n", generation_name (generation), TV_ELAPSED (atv, btv)));
 
        /*
         * handle disappearing links
@@ -2394,9 +2955,19 @@ finish_gray_stack (char *start_addr, char *end_addr)
         * GC a finalized object may lose the monitor because it is cleared before the finalizer is
         * called.
         */
-       null_link_in_range (start_addr, end_addr);
-       TV_GETTIME (btv);
-       DEBUG (2, fprintf (gc_debug_file, "Finalize queue handling scan: %d usecs\n", TV_ELAPSED (atv, btv)));
+       g_assert (gray_object_queue_is_empty ());
+       for (;;) {
+               null_link_in_range (start_addr, end_addr, generation);
+               if (generation == GENERATION_OLD)
+                       null_link_in_range (start_addr, end_addr, GENERATION_NURSERY);
+               if (gray_object_queue_is_empty ())
+                       break;
+               drain_gray_stack (start_addr, end_addr);
+       }
+
+       g_assert (gray_object_queue_is_empty ());
+       /* DEBUG (2, fprintf (gc_debug_file, "Copied from %s to old space: %d bytes (%p-%p)\n", generation_name (generation), (int)(to_space_bumper - to_space), to_space, to_space_bumper)); */
+       to_space_set_next_data ();
 }
 
 static int last_num_pinned = 0;
@@ -2408,9 +2979,12 @@ build_nursery_fragments (int start_pin, int end_pin)
        size_t frag_size;
        int i;
 
-       /* FIXME: handle non-NULL fragment_freelist */
-       fragment_freelist = nursery_fragments;
-       nursery_fragments = NULL;
+       while (nursery_fragments) {
+               Fragment *next = nursery_fragments->next;
+               nursery_fragments->next = fragment_freelist;
+               fragment_freelist = nursery_fragments;
+               nursery_fragments = next;
+       }
        frag_start = nursery_start;
        fragment_total = 0;
        /* clear scan starts */
@@ -2427,30 +3001,6 @@ build_nursery_fragments (int start_pin, int end_pin)
                frag_size += ALLOC_ALIGN - 1;
                frag_size &= ~(ALLOC_ALIGN - 1);
                frag_start = (char*)pin_queue [i] + frag_size;
-               /* 
-                * pin_queue [i] might point to a half-constructed string or vector whose
-                * length field is not set. In that case, frag_start points inside the 
-                * (zero initialized) object. Find the end of the object by scanning forward.
-                * 
-                */
-               if (is_maybe_half_constructed (pin_queue [i])) {
-                       char *tlab_end;
-
-                       /* This is also hit for zero length arrays/strings */
-
-                       /* Find the end of the TLAB which contained this allocation */
-                       tlab_end = find_tlab_next_from_address (pin_queue [i]);
-
-                       if (tlab_end) {
-                               while ((frag_start < tlab_end) && *(mword*)frag_start == 0)
-                                       frag_start += sizeof (mword);
-                       } else {
-                               /*
-                                * FIXME: The object is either not allocated in a TLAB, or it isn't a
-                                * half constructed object.
-                                */
-                       }
-               }
        }
        nursery_last_pinned_end = frag_start;
        frag_end = nursery_real_end;
@@ -2523,66 +3073,237 @@ scan_from_registered_roots (char *addr_start, char *addr_end, int root_type)
        }
 }
 
-/*
- * Collect objects in the nursery.
- */
 static void
-collect_nursery (size_t requested_size)
+dump_occupied (char *start, char *end, char *section_start)
 {
-       GCMemSection *section;
-       size_t max_garbage_amount;
-       int i;
-       char *orig_nursery_next;
-       Fragment *frag;
-       TV_DECLARE (all_atv);
-       TV_DECLARE (all_btv);
-       TV_DECLARE (atv);
-       TV_DECLARE (btv);
+       fprintf (heap_dump_file, "<occupied offset=\"%zd\" size=\"%zd\"/>\n", start - section_start, end - start);
+}
 
-       degraded_mode = 0;
-       orig_nursery_next = nursery_next;
-       nursery_next = MAX (nursery_next, nursery_last_pinned_end);
-       /* FIXME: optimize later to use the higher address where an object can be present */
-       nursery_next = MAX (nursery_next, nursery_real_end);
+static void
+dump_section (GCMemSection *section, const char *type)
+{
+       char *start = section->data;
+       char *end = section->data + section->size;
+       char *occ_start = NULL;
+       GCVTable *vt;
+       char *old_start = NULL; /* just for debugging */
 
-       if (consistency_check_at_minor_collection)
-               check_consistency ();
+       fprintf (heap_dump_file, "<section type=\"%s\" size=\"%zu\">\n", type, section->size);
 
-       DEBUG (1, fprintf (gc_debug_file, "Start nursery collection %d %p-%p, size: %d\n", num_minor_gcs, nursery_start, nursery_next, (int)(nursery_next - nursery_start)));
-       max_garbage_amount = nursery_next - nursery_start;
+       while (start < end) {
+               guint size;
+               MonoClass *class;
 
-       /* Clear all remaining nursery fragments, pinning depends on this */
-       if (nursery_clear_policy == CLEAR_AT_TLAB_CREATION) {
-               g_assert (orig_nursery_next <= nursery_frag_real_end);
-               memset (orig_nursery_next, 0, nursery_frag_real_end - orig_nursery_next);
-               for (frag = nursery_fragments; frag; frag = frag->next) {
-                       memset (frag->fragment_start, 0, frag->fragment_end - frag->fragment_start);
+               if (!*(void**)start) {
+                       if (occ_start) {
+                               dump_occupied (occ_start, start, section->data);
+                               occ_start = NULL;
+                       }
+                       start += sizeof (void*); /* should be ALLOC_ALIGN, really */
+                       continue;
                }
-       }
+               g_assert (start < section->next_data);
+
+               if (!occ_start)
+                       occ_start = start;
+
+               vt = (GCVTable*)LOAD_VTABLE (start);
+               class = vt->klass;
+
+               size = safe_object_get_size ((MonoObject*) start);
+               size += ALLOC_ALIGN - 1;
+               size &= ~(ALLOC_ALIGN - 1);
+
+               /*
+               fprintf (heap_dump_file, "<object offset=\"%d\" class=\"%s.%s\" size=\"%d\"/>\n",
+                               start - section->data,
+                               vt->klass->name_space, vt->klass->name,
+                               size);
+               */
+
+               old_start = start;
+               start += size;
+       }
+       if (occ_start)
+               dump_occupied (occ_start, start, section->data);
+
+       fprintf (heap_dump_file, "</section>\n");
+}
+
+static void
+dump_heap (const char *type, int num, const char *reason)
+{
+       static char const *internal_mem_names [] = { "pin-queue", "fragment", "section", "scan-starts",
+                                                    "fin-table", "finalize-entry", "dislink-table",
+                                                    "dislink", "roots-table", "root-record", "statistics",
+                                                    "remset", "gray-queue", "store-remset" };
+
+       GCMemSection *section;
+       LOSObject *bigobj;
+       int i;
+
+       fprintf (heap_dump_file, "<collection type=\"%s\" num=\"%d\"", type, num);
+       if (reason)
+               fprintf (heap_dump_file, " reason=\"%s\"", reason);
+       fprintf (heap_dump_file, ">\n");
+       fprintf (heap_dump_file, "<other-mem-usage type=\"pinned-chunks\" size=\"%ld\"/>\n", pinned_chunk_bytes_alloced);
+       fprintf (heap_dump_file, "<other-mem-usage type=\"large-internal\" size=\"%ld\"/>\n", large_internal_bytes_alloced);
+       fprintf (heap_dump_file, "<other-mem-usage type=\"mempools\" size=\"%ld\"/>\n", mono_mempool_get_bytes_allocated ());
+       for (i = 0; i < INTERNAL_MEM_MAX; ++i)
+               fprintf (heap_dump_file, "<other-mem-usage type=\"%s\" size=\"%ld\"/>\n", internal_mem_names [i], small_internal_mem_bytes [i]);
+       fprintf (heap_dump_file, "<pinned type=\"stack\" bytes=\"%zu\"/>\n", pinned_byte_counts [PIN_TYPE_STACK]);
+       /* fprintf (heap_dump_file, "<pinned type=\"static-data\" bytes=\"%d\"/>\n", pinned_byte_counts [PIN_TYPE_STATIC_DATA]); */
+       fprintf (heap_dump_file, "<pinned type=\"other\" bytes=\"%zu\"/>\n", pinned_byte_counts [PIN_TYPE_OTHER]);
+
+       dump_section (nursery_section, "nursery");
+
+       for (section = section_list; section; section = section->block.next) {
+               if (section != nursery_section)
+                       dump_section (section, "old");
+       }
+
+       fprintf (heap_dump_file, "<los>\n");
+       for (bigobj = los_object_list; bigobj; bigobj = bigobj->next) {
+               MonoObject *obj = (MonoObject*) bigobj->data;
+               MonoClass *class = mono_object_class (obj);
+
+               fprintf (heap_dump_file, "<object class=\"%s.%s\" size=\"%d\"/>\n",
+                               class->name_space, class->name,
+                               safe_object_get_size (obj));
+       }
+       fprintf (heap_dump_file, "</los>\n");
+
+       fprintf (heap_dump_file, "</collection>\n");
+}
+
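/*
 * Editorial sketch, not part of this change: roughly what one <collection>
 * element written by dump_heap () above looks like, with made-up numbers:
 *
 *   <collection type="minor" num="3">
 *   <other-mem-usage type="pinned-chunks" size="131072"/>
 *   <other-mem-usage type="large-internal" size="0"/>
 *   ...
 *   <pinned type="stack" bytes="4096"/>
 *   <pinned type="other" bytes="512"/>
 *   <section type="nursery" size="524288">
 *   <occupied offset="0" size="16384"/>
 *   </section>
 *   <section type="old" size="1048576">
 *   <occupied offset="0" size="786432"/>
 *   </section>
 *   <los>
 *   <object class="System.Byte[]" size="131096"/>
 *   </los>
 *   </collection>
 */
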
+static void
+init_stats (void)
+{
+       static gboolean inited = FALSE;
+
+#ifdef HEAVY_STATISTICS
+       num_copy_object_called = 0;
+       num_objects_copied = 0;
+#endif
+
+       if (inited)
+               return;
+
+#ifdef HEAVY_STATISTICS
+       mono_counters_register ("WBarrier set field", MONO_COUNTER_GC | MONO_COUNTER_INT, &stat_wbarrier_set_field);
+       mono_counters_register ("WBarrier set arrayref", MONO_COUNTER_GC | MONO_COUNTER_INT, &stat_wbarrier_set_arrayref);
+       mono_counters_register ("WBarrier arrayref copy", MONO_COUNTER_GC | MONO_COUNTER_INT, &stat_wbarrier_arrayref_copy);
+       mono_counters_register ("WBarrier generic store called", MONO_COUNTER_GC | MONO_COUNTER_INT, &stat_wbarrier_generic_store);
+       mono_counters_register ("WBarrier generic store stored", MONO_COUNTER_GC | MONO_COUNTER_INT, &stat_wbarrier_generic_store_remset);
+       mono_counters_register ("WBarrier set root", MONO_COUNTER_GC | MONO_COUNTER_INT, &stat_wbarrier_set_root);
+       mono_counters_register ("WBarrier value copy", MONO_COUNTER_GC | MONO_COUNTER_INT, &stat_wbarrier_value_copy);
+       mono_counters_register ("WBarrier object copy", MONO_COUNTER_GC | MONO_COUNTER_INT, &stat_wbarrier_object_copy);
+
+       mono_counters_register ("# objects allocated", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_objects_alloced);
+       mono_counters_register ("# copy_object() called (nursery)", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_copy_object_called_nursery);
+       mono_counters_register ("# objects copied (nursery)", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_objects_copied_nursery);
+       mono_counters_register ("# copy_object() called (major)", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_copy_object_called_major);
+       mono_counters_register ("# objects copied (major)", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_objects_copied_major);
+
+       mono_counters_register ("# copy_object() failed from space", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_copy_object_failed_from_space);
+       mono_counters_register ("# copy_object() failed forwarded", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_copy_object_failed_forwarded);
+       mono_counters_register ("# copy_object() failed pinned", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_copy_object_failed_pinned);
+       mono_counters_register ("# copy_object() failed large or pinned chunk", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_copy_object_failed_large_pinned);
+       mono_counters_register ("# copy_object() failed to space", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_copy_object_failed_to_space);
+
+       mono_counters_register ("Store remsets", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_store_remsets);
+       mono_counters_register ("Unique store remsets", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_store_remsets_unique);
+       mono_counters_register ("Saved remsets 1", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_saved_remsets_1);
+       mono_counters_register ("Saved remsets 2", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_saved_remsets_2);
+       mono_counters_register ("Global remsets added", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_global_remsets_added);
+       mono_counters_register ("Global remsets processed", MONO_COUNTER_GC | MONO_COUNTER_LONG, &stat_global_remsets_processed);
+#endif
+
+       inited = TRUE;
+}
+
+static void
+commit_stats (int generation)
+{
+#ifdef HEAVY_STATISTICS
+       if (generation == GENERATION_NURSERY) {
+               stat_copy_object_called_nursery += num_copy_object_called;
+               stat_objects_copied_nursery += num_objects_copied;
+       } else {
+               g_assert (generation == GENERATION_OLD);
+               stat_copy_object_called_major += num_copy_object_called;
+               stat_objects_copied_major += num_objects_copied;
+       }
+#endif
+}
+
+/*
+ * Collect objects in the nursery.  Returns whether to trigger a major
+ * collection.
+ */
+static gboolean
+collect_nursery (size_t requested_size)
+{
+       size_t max_garbage_amount;
+       int i;
+       char *orig_nursery_next;
+       Fragment *frag;
+       GCMemSection *section;
+       int old_num_major_sections = num_major_sections;
+       int sections_alloced;
+       TV_DECLARE (all_atv);
+       TV_DECLARE (all_btv);
+       TV_DECLARE (atv);
+       TV_DECLARE (btv);
+
+       init_stats ();
+
+       degraded_mode = 0;
+       orig_nursery_next = nursery_next;
+       nursery_next = MAX (nursery_next, nursery_last_pinned_end);
+       /* FIXME: optimize later to use the higher address where an object can be present */
+       nursery_next = MAX (nursery_next, nursery_real_end);
+
+       if (consistency_check_at_minor_collection)
+               check_consistency ();
+
+       DEBUG (1, fprintf (gc_debug_file, "Start nursery collection %d %p-%p, size: %d\n", num_minor_gcs, nursery_start, nursery_next, (int)(nursery_next - nursery_start)));
+       max_garbage_amount = nursery_next - nursery_start;
+       g_assert (nursery_section->size >= max_garbage_amount);
+
+       /* Clear all remaining nursery fragments, pinning depends on this */
+       if (nursery_clear_policy == CLEAR_AT_TLAB_CREATION) {
+               g_assert (orig_nursery_next <= nursery_frag_real_end);
+               memset (orig_nursery_next, 0, nursery_frag_real_end - orig_nursery_next);
+               for (frag = nursery_fragments; frag; frag = frag->next) {
+                       memset (frag->fragment_start, 0, frag->fragment_end - frag->fragment_start);
+               }
+       }
+
+       if (xdomain_checks)
+               check_for_xdomain_refs ();
 
-       /* 
-        * not enough room in the old generation to store all the possible data from 
-        * the nursery in a single continuous space.
-        * We reset to_space if we allocated objects in degraded mode.
-        */
-       if (to_space_section)
-               to_space = gray_objects = gray_first = to_space_section->next_data;
-       if ((to_space_end - to_space) < max_garbage_amount) {
-               section = alloc_section (nursery_section->size * 4);
-               g_assert (nursery_section->size >= max_garbage_amount);
-               to_space = gray_objects = gray_first = section->next_data;
-               to_space_end = section->end_data;
-               to_space_section = section;
-       }
-       DEBUG (2, fprintf (gc_debug_file, "To space setup: %p-%p in section %p\n", to_space, to_space_end, to_space_section));
        nursery_section->next_data = nursery_next;
 
+       if (!to_space_section) {
+               new_to_space_section ();
+       } else {
+               /* we might have done degraded allocation since the
+                  last collection */
+               g_assert (to_space_bumper <= to_space_section->next_data);
+               to_space_bumper = to_space_section->next_data;
+
+               to_space_section->is_to_space = TRUE;
+       }
+       gray_object_queue_init ();
+
        num_minor_gcs++;
        mono_stats.minor_gc_count ++;
        /* world must be stopped already */
        TV_GETTIME (all_atv);
        TV_GETTIME (atv);
        /* pin from pinned handles */
+       init_pinning ();
        pin_from_roots (nursery_start, nursery_next);
        /* identify pinned objects */
        optimize_pin_queue (0);
@@ -2608,13 +3329,14 @@ collect_nursery (size_t requested_size)
        }
        /* registered roots, this includes static fields */
        scan_from_registered_roots (nursery_start, nursery_next, ROOT_TYPE_NORMAL);
+       scan_from_registered_roots (nursery_start, nursery_next, ROOT_TYPE_WBARRIER);
        scan_thread_data (nursery_start, nursery_next, TRUE);
        /* alloc_pinned objects */
        scan_from_pinned_objects (nursery_start, nursery_next);
        TV_GETTIME (btv);
        DEBUG (2, fprintf (gc_debug_file, "Root scan: %d usecs\n", TV_ELAPSED (atv, btv)));
 
-       finish_gray_stack (nursery_start, nursery_next);
+       finish_gray_stack (nursery_start, nursery_next, GENERATION_NURSERY);
 
        /* walk the pin_queue, build up the fragment list of free memory, unmark
         * pinned objects as we go, memzero() the empty fragments so they are ready for the
@@ -2624,9 +3346,17 @@ collect_nursery (size_t requested_size)
        TV_GETTIME (atv);
        DEBUG (2, fprintf (gc_debug_file, "Fragment creation: %d usecs, %zd bytes available\n", TV_ELAPSED (btv, atv), fragment_total));
 
+       for (section = section_list; section; section = section->block.next) {
+               if (section->is_to_space)
+                       section->is_to_space = FALSE;
+       }
+
        TV_GETTIME (all_btv);
        mono_stats.minor_gc_time_usecs += TV_ELAPSED (all_atv, all_btv);
 
+       if (heap_dump_file)
+               dump_heap ("minor", num_minor_gcs - 1, NULL);
+
        /* prepare the pin queue for the next collection */
        last_num_pinned = next_pin_slot;
        next_pin_slot = 0;
@@ -2634,17 +3364,33 @@ collect_nursery (size_t requested_size)
                DEBUG (4, fprintf (gc_debug_file, "Finalizer-thread wakeup: ready %d\n", num_ready_finalizers));
                mono_gc_finalize_notify ();
        }
+       pin_stats_reset ();
+
+       g_assert (gray_object_queue_is_empty ());
+
+       commit_stats (GENERATION_NURSERY);
+
+       sections_alloced = num_major_sections - old_num_major_sections;
+       minor_collection_sections_alloced += sections_alloced;
+
+       return minor_collection_sections_alloced > minor_collection_section_allowance;
+}
+
+static void
+scan_from_pinned_chunk_if_marked (PinnedChunk *chunk, char *obj, size_t size, void *dummy)
+{
+       if (object_is_pinned (obj))
+               scan_object (obj, NULL, (char*)-1);
 }
 
 static void
-major_collection (void)
+major_collection (const char *reason)
 {
        GCMemSection *section, *prev_section;
        LOSObject *bigobj, *prevbo;
        int i;
        PinnedChunk *chunk;
        Fragment *frag;
-       int count;
        TV_DECLARE (all_atv);
        TV_DECLARE (all_btv);
        TV_DECLARE (atv);
@@ -2655,6 +3401,10 @@ major_collection (void)
        char *heap_start = NULL;
        char *heap_end = (char*)-1;
        size_t copy_space_required = 0;
+       int old_num_major_sections = num_major_sections;
+       int num_major_sections_saved, save_target, allowance_target;
+
+       init_stats ();
 
        degraded_mode = 0;
        DEBUG (1, fprintf (gc_debug_file, "Start major collection %d\n", num_major_gcs));
@@ -2670,6 +3420,9 @@ major_collection (void)
                }
        }
 
+       if (xdomain_checks)
+               check_for_xdomain_refs ();
+
        /* 
         * FIXME: implement Mark/Compact
         * Until that is done, we can just apply mostly the same alg as for the nursery:
@@ -2690,53 +3443,74 @@ major_collection (void)
        clear_remsets ();
        /* world must be stopped already */
        TV_GETTIME (atv);
+       init_pinning ();
+       DEBUG (6, fprintf (gc_debug_file, "Collecting pinned addresses\n"));
+       pin_from_roots ((void*)lowest_heap_address, (void*)highest_heap_address);
+       optimize_pin_queue (0);
+
+       /*
+        * pin_queue now contains all candidate pointers, sorted and
+        * uniqued.  We must do two passes now to figure out which
+        * objects are pinned.
+        *
+        * The first is to find within the pin_queue the area for each
+        * section.  This requires that the pin_queue be sorted.  We
+        * also process the LOS objects and pinned chunks here.
+        *
+        * The second, destructive, pass is to reduce the section
+        * areas to pointers to the actually pinned objects.
+        */
        DEBUG (6, fprintf (gc_debug_file, "Pinning from sections\n"));
-       for (section = section_list; section; section = section->next) {
-               section->pin_queue_start = count = section->pin_queue_end = next_pin_slot;
-               pin_from_roots (section->data, section->next_data);
-               if (count != next_pin_slot) {
-                       int reduced_to;
-                       optimize_pin_queue (count);
-                       DEBUG (6, fprintf (gc_debug_file, "Found %d pinning addresses in section %p (%d-%d)\n", next_pin_slot - count, section, count, next_pin_slot));
-                       reduced_to = pin_objects_from_addresses (section, pin_queue + count, pin_queue + next_pin_slot, section->data, section->next_data);
-                       section->pin_queue_end = next_pin_slot = count + reduced_to;
-               }
-               copy_space_required += (char*)section->next_data - (char*)section->data;
+       /* first pass for the sections */
+       for (section = section_list; section; section = section->block.next) {
+               int start, end;
+               DEBUG (6, fprintf (gc_debug_file, "Pinning from section %p (%p-%p)\n", section, section->data, section->end_data));
+               find_optimized_pin_queue_area (section->data, section->end_data, &start, &end);
+               DEBUG (6, fprintf (gc_debug_file, "Found %d pinning addresses in section %p (%d-%d)\n",
+                                               end - start, section, start, end));
+               section->pin_queue_start = start;
+               section->pin_queue_end = end;
        }
        /* identify possible pointers to the inside of large objects */
        DEBUG (6, fprintf (gc_debug_file, "Pinning from large objects\n"));
        for (bigobj = los_object_list; bigobj; bigobj = bigobj->next) {
-               count = next_pin_slot;
-               pin_from_roots (bigobj->data, (char*)bigobj->data + bigobj->size);
-               /* FIXME: this is only valid until we don't optimize the pin queue midway */
-               if (next_pin_slot != count) {
-                       next_pin_slot = count;
+               int start, end;
+               find_optimized_pin_queue_area (bigobj->data, (char*)bigobj->data + bigobj->size, &start, &end);
+               if (start != end) {
                        pin_object (bigobj->data);
+                       if (heap_dump_file)
+                               pin_stats_register_object ((char*) bigobj->data, safe_object_get_size ((MonoObject*) bigobj->data));
                        DEBUG (6, fprintf (gc_debug_file, "Marked large object %p (%s) size: %zd from roots\n", bigobj->data, safe_name (bigobj->data), bigobj->size));
                }
        }
        /* look for pinned addresses for pinned-alloc objects */
        DEBUG (6, fprintf (gc_debug_file, "Pinning from pinned-alloc objects\n"));
-       for (chunk = pinned_chunk_list; chunk; chunk = chunk->next) {
-               count = next_pin_slot;
-               pin_from_roots (chunk->start_data, (char*)chunk + chunk->num_pages * FREELIST_PAGESIZE);
-               /* FIXME: this is only valid until we don't optimize the pin queue midway */
-               if (next_pin_slot != count) {
-                       mark_pinned_from_addresses (chunk, pin_queue + count, pin_queue + next_pin_slot);
-                       next_pin_slot = count;
+       for (chunk = pinned_chunk_list; chunk; chunk = chunk->block.next) {
+               int start, end;
+               find_optimized_pin_queue_area (chunk->start_data, (char*)chunk + chunk->num_pages * FREELIST_PAGESIZE, &start, &end);
+               if (start != end)
+                       mark_pinned_from_addresses (chunk, pin_queue + start, pin_queue + end);
+       }
+       /* second pass for the sections */
+       for (section = section_list; section; section = section->block.next) {
+               int start = section->pin_queue_start;
+               int end = section->pin_queue_end;
+               if (start != end) {
+                       int reduced_to;
+                       reduced_to = pin_objects_from_addresses (section, pin_queue + start, pin_queue + end,
+                                       section->data, section->next_data);
+                       section->pin_queue_start = start;
+                       section->pin_queue_end = start + reduced_to;
                }
+               copy_space_required += (char*)section->next_data - (char*)section->data;
        }
 
        TV_GETTIME (btv);
        DEBUG (2, fprintf (gc_debug_file, "Finding pinned pointers: %d in %d usecs\n", next_pin_slot, TV_ELAPSED (atv, btv)));
        DEBUG (4, fprintf (gc_debug_file, "Start scan with %d pinned objects\n", next_pin_slot));
 
-       /* allocate the big to space */
-       DEBUG (4, fprintf (gc_debug_file, "Allocate tospace for size: %zd\n", copy_space_required));
-       section = alloc_section (copy_space_required);
-       to_space = gray_objects = gray_first = section->next_data;
-       to_space_end = section->end_data;
-       to_space_section = section;
+       new_to_space_section ();
+       gray_object_queue_init ();
 
        /* the old generation doesn't need to be scanned (no remembered sets or card
         * table needed either): the only objects that must survive are those pinned and
@@ -2745,10 +3519,21 @@ major_collection (void)
         * move all the objects.
         */
        /* the pinned objects are roots (big objects are included in this list, too) */
-       for (i = 0; i < next_pin_slot; ++i) {
-               DEBUG (6, fprintf (gc_debug_file, "Precise object scan %d of pinned %p (%s)\n", i, pin_queue [i], safe_name (pin_queue [i])));
-               scan_object (pin_queue [i], heap_start, heap_end);
+       for (section = section_list; section; section = section->block.next) {
+               for (i = section->pin_queue_start; i < section->pin_queue_end; ++i) {
+                       DEBUG (6, fprintf (gc_debug_file, "Precise object scan %d of pinned %p (%s)\n",
+                                                       i, pin_queue [i], safe_name (pin_queue [i])));
+                       scan_object (pin_queue [i], heap_start, heap_end);
+               }
+       }
+       for (bigobj = los_object_list; bigobj; bigobj = bigobj->next) {
+               if (object_is_pinned (bigobj->data)) {
+                       DEBUG (6, fprintf (gc_debug_file, "Precise object scan pinned LOS object %p (%s)\n",
+                                                       bigobj->data, safe_name (bigobj->data)));
+                       scan_object (bigobj->data, heap_start, heap_end);
+               }
        }
+       scan_pinned_objects (scan_from_pinned_chunk_if_marked, NULL);
        /* registered roots, this includes static fields */
        scan_from_registered_roots (heap_start, heap_end, ROOT_TYPE_NORMAL);
        scan_from_registered_roots (heap_start, heap_end, ROOT_TYPE_WBARRIER);
@@ -2768,7 +3553,9 @@ major_collection (void)
         */
        scan_needed_big_objects (heap_start, heap_end);
        /* all the objects in the heap */
-       finish_gray_stack (heap_start, heap_end);
+       finish_gray_stack (heap_start, heap_end, GENERATION_OLD);
+
+       unset_to_space ();
 
        /* sweep the big objects list */
        prevbo = NULL;
@@ -2798,28 +3585,30 @@ major_collection (void)
        prev_section = NULL;
        for (section = section_list; section;) {
                /* to_space doesn't need handling here and the nursery is special */
-               if (section == to_space_section || section == nursery_section) {
+               if (section->is_to_space || section == nursery_section) {
+                       if (section->is_to_space)
+                               section->is_to_space = FALSE;
                        prev_section = section;
-                       section = section->next;
+                       section = section->block.next;
                        continue;
                }
                /* no pinning object, so the section is free */
                if (section->pin_queue_start == section->pin_queue_end) {
                        GCMemSection *to_free;
                        if (prev_section)
-                               prev_section->next = section->next;
+                               prev_section->block.next = section->block.next;
                        else
-                               section_list = section->next;
+                               section_list = section->block.next;
                        to_free = section;
-                       section = section->next;
-                       free_mem_section (to_free);
+                       section = section->block.next;
+                       free_major_section (to_free);
                        continue;
                } else {
                        DEBUG (6, fprintf (gc_debug_file, "Section %p has still pinned objects (%d)\n", section, section->pin_queue_end - section->pin_queue_start));
                        build_section_fragments (section);
                }
                prev_section = section;
-               section = section->next;
+               section = section->block.next;
        }
 
        /* walk the pin_queue, build up the fragment list of free memory, unmark
@@ -2830,68 +3619,79 @@ major_collection (void)
 
        TV_GETTIME (all_btv);
        mono_stats.major_gc_time_usecs += TV_ELAPSED (all_atv, all_btv);
+
+       if (heap_dump_file)
+               dump_heap ("major", num_major_gcs - 1, reason);
+
        /* prepare the pin queue for the next collection */
        next_pin_slot = 0;
        if (fin_ready_list || critical_fin_list) {
                DEBUG (4, fprintf (gc_debug_file, "Finalizer-thread wakeup: ready %d\n", num_ready_finalizers));
                mono_gc_finalize_notify ();
        }
+       pin_stats_reset ();
+
+       g_assert (gray_object_queue_is_empty ());
+
+       commit_stats (GENERATION_OLD);
+
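+       /*
+        * Estimate how many major sections this collection freed and size the
+        * next minor-collection section allowance from it: at the current
+        * rate of reclamation, allow roughly enough new section allocation to
+        * free about half of the major heap again, clamped between the
+        * minimum allowance and the current number of major sections.
+        */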
+       num_major_sections_saved = MAX (old_num_major_sections - num_major_sections, 1);
+
+       save_target = num_major_sections / 2;
+       allowance_target = save_target * minor_collection_sections_alloced / num_major_sections_saved;
+
+       minor_collection_section_allowance = MAX (MIN (allowance_target, num_major_sections), MIN_MINOR_COLLECTION_SECTION_ALLOWANCE);
+
+       /*
+       printf ("alloced %d  saved %d  target %d  allowance %d\n",
+                       minor_collection_sections_alloced, num_major_sections_saved, allowance_target,
+                       minor_collection_section_allowance);
+       */
+
+       minor_collection_sections_alloced = 0;
 }
 
 /*
  * Allocate a new section of memory to be used as old generation.
  */
 static GCMemSection*
-alloc_section (size_t size)
+alloc_major_section (void)
 {
        GCMemSection *section;
-       char *data;
        int scan_starts;
-       size_t new_size = next_section_size;
-
-       if (size > next_section_size) {
-               new_size = size;
-               new_size += pagesize - 1;
-               new_size &= ~(pagesize - 1);
-       }
-       section_size_used++;
-       if (section_size_used > 3) {
-               section_size_used = 0;
-               next_section_size *= 2;
-               if (next_section_size > max_section_size)
-                       next_section_size = max_section_size;
-       }
-       section = get_internal_mem (sizeof (GCMemSection));
-       data = get_os_memory (new_size, TRUE);
-       section->data = section->next_data = data;
-       section->size = new_size;
-       section->end_data = data + new_size;
-       UPDATE_HEAP_BOUNDARIES (data, section->end_data);
-       total_alloc += new_size;
-       DEBUG (2, fprintf (gc_debug_file, "Expanding heap size: %zd, total: %zd\n", new_size, total_alloc));
-       section->data = data;
-       section->size = new_size;
-       scan_starts = new_size / SCAN_START_SIZE;
-       section->scan_starts = get_internal_mem (sizeof (char*) * scan_starts);
+
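+       /* A major section is a single aligned OS block: the GCMemSection
+        * descriptor sits at the start of the block and the usable payload
+        * follows it, so no separate header allocation is needed. */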
+       section = get_os_memory_aligned (MAJOR_SECTION_SIZE, TRUE);
+       section->next_data = section->data = (char*)section + SIZEOF_GC_MEM_SECTION;
+       g_assert (!((mword)section->data & 7));
+       section->size = MAJOR_SECTION_SIZE - SIZEOF_GC_MEM_SECTION;
+       section->end_data = section->data + section->size;
+       UPDATE_HEAP_BOUNDARIES (section->data, section->end_data);
+       total_alloc += section->size;
+       DEBUG (3, fprintf (gc_debug_file, "New major heap section: (%p-%p), total: %zd\n", section->data, section->end_data, total_alloc));
+       scan_starts = section->size / SCAN_START_SIZE;
+       section->scan_starts = get_internal_mem (sizeof (char*) * scan_starts, INTERNAL_MEM_SCAN_STARTS);
        section->num_scan_start = scan_starts;
-       section->role = MEMORY_ROLE_GEN1;
+       section->block.role = MEMORY_ROLE_GEN1;
+       section->is_to_space = TRUE;
 
        /* add to the section list */
-       section->next = section_list;
+       section->block.next = section_list;
        section_list = section;
 
+       ++num_major_sections;
+
        return section;
 }
 
 static void
-free_mem_section (GCMemSection *section)
+free_major_section (GCMemSection *section)
 {
-       char *data = section->data;
-       size_t size = section->size;
-       DEBUG (2, fprintf (gc_debug_file, "Freed section %p, size %zd\n", data, size));
-       free_os_memory (data, size);
-       free_internal_mem (section);
-       total_alloc -= size;
+       DEBUG (3, fprintf (gc_debug_file, "Freed major section %p (%p-%p)\n", section, section->data, section->end_data));
+       free_internal_mem (section->scan_starts, INTERNAL_MEM_SCAN_STARTS);
+       free_os_memory (section, MAJOR_SECTION_SIZE);
+       total_alloc -= MAJOR_SECTION_SIZE - SIZEOF_GC_MEM_SECTION;
+
+       --num_major_sections;
 }
 
 /*
@@ -2911,7 +3711,8 @@ minor_collect_or_expand_inner (size_t size)
        }
        if (do_minor_collection) {
                stop_world ();
-               collect_nursery (size);
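+               /* collect_nursery () now reports whether the nursery collection
+                * was not sufficient, in which case a major collection is run
+                * while the world is still stopped. */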
+               if (collect_nursery (size))
+                       major_collection ("minor overflow");
                DEBUG (2, fprintf (gc_debug_file, "Heap size: %zd, LOS size: %zd\n", total_alloc, los_memory_usage));
                restart_world ();
                /* this also sets the proper pointers for the next allocation */
@@ -2960,7 +3761,7 @@ get_os_memory (size_t size, int activate)
 static void
 free_os_memory (void *addr, size_t size)
 {
-       munmap (addr, size);
+       mono_vfree (addr, size);
 }
 
 /*
@@ -3001,12 +3802,12 @@ report_internal_mem_usage (void) {
        int i;
        printf ("Internal memory usage:\n");
        i = 0;
-       for (chunk = internal_chunk_list; chunk; chunk = chunk->next) {
+       for (chunk = internal_chunk_list; chunk; chunk = chunk->block.next) {
                report_pinned_chunk (chunk, i++);
        }
        printf ("Pinned memory usage:\n");
        i = 0;
-       for (chunk = pinned_chunk_list; chunk; chunk = chunk->next) {
+       for (chunk = pinned_chunk_list; chunk; chunk = chunk->block.next) {
                report_pinned_chunk (chunk, i++);
        }
 }
@@ -3049,20 +3850,22 @@ mark_pinned_from_addresses (PinnedChunk *chunk, void **start, void **end)
                /* if the vtable is inside the chunk it's on the freelist, so skip */
                if (*ptr && (*ptr < (void*)chunk->start_data || *ptr > (void*)((char*)chunk + chunk->num_pages * FREELIST_PAGESIZE))) {
                        pin_object (addr);
+                       if (heap_dump_file)
+                               pin_stats_register_object ((char*) addr, safe_object_get_size ((MonoObject*) addr));
                        DEBUG (6, fprintf (gc_debug_file, "Marked pinned object %p (%s) from roots\n", addr, safe_name (addr)));
                }
        }
 }
 
 static void
-scan_pinned_objects (void (*callback) (PinnedChunk*, char*, size_t, void*), void *callback_data)
+scan_pinned_objects (ScanPinnedObjectCallbackFunc callback, void *callback_data)
 {
        PinnedChunk *chunk;
        int i, obj_size;
        char *p, *endp;
        void **ptr;
        void *end_chunk;
-       for (chunk = pinned_chunk_list; chunk; chunk = chunk->next) {
+       for (chunk = pinned_chunk_list; chunk; chunk = chunk->block.next) {
                end_chunk = (char*)chunk + chunk->num_pages * FREELIST_PAGESIZE;
                DEBUG (6, fprintf (gc_debug_file, "Scanning pinned chunk %p (range: %p-%p)\n", chunk, chunk->start_data, end_chunk));
                for (i = 0; i < chunk->num_pages; ++i) {
@@ -3115,7 +3918,7 @@ static void
 scan_from_pinned_objects (char *addr_start, char *addr_end)
 {
        char *data [2] = { addr_start, addr_end };
-       scan_pinned_objects (scan_object_callback, data);
+       scan_pinned_objects ((ScanPinnedObjectCallbackFunc)scan_object_callback, data);
 }
 
 /*
@@ -3159,19 +3962,18 @@ build_freelist (PinnedChunk *chunk, int slot, int size, char *start_page, char *
 }
 
 static PinnedChunk*
-alloc_pinned_chunk (size_t size)
+alloc_pinned_chunk (void)
 {
        PinnedChunk *chunk;
        int offset;
+       int size = MAJOR_SECTION_SIZE;
+
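+       /* Pinned chunks are now fixed-size, aligned OS blocks of the same
+        * size as major sections, with the common block header (role/next)
+        * at the start. */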
+       chunk = get_os_memory_aligned (size, TRUE);
+       chunk->block.role = MEMORY_ROLE_PINNED;
 
-       size += pagesize; /* at least one page */
-       size += pagesize - 1;
-       size &= ~(pagesize - 1);
-       if (size < PINNED_CHUNK_MIN_SIZE * 2)
-               size = PINNED_CHUNK_MIN_SIZE * 2;
-       chunk = get_os_memory (size, TRUE);
        UPDATE_HEAP_BOUNDARIES (chunk, ((char*)chunk + size));
        total_alloc += size;
+       pinned_chunk_bytes_alloced += size;
 
        /* setup the bookeeping fields */
        chunk->num_pages = size / FREELIST_PAGESIZE;
@@ -3189,7 +3991,7 @@ alloc_pinned_chunk (size_t size)
        /* allocate the first page to the freelist */
        chunk->page_sizes [0] = PINNED_FIRST_SLOT_SIZE;
        build_freelist (chunk, slot_for_size (PINNED_FIRST_SLOT_SIZE), PINNED_FIRST_SLOT_SIZE, chunk->start_data, ((char*)chunk + FREELIST_PAGESIZE));
-       DEBUG (4, fprintf (gc_debug_file, "Allocated pinned chunk %p, size: %zd\n", chunk, size));
+       DEBUG (4, fprintf (gc_debug_file, "Allocated pinned chunk %p, size: %d\n", chunk, size));
        min_pinned_chunk_addr = MIN (min_pinned_chunk_addr, (char*)chunk->start_data);
        max_pinned_chunk_addr = MAX (max_pinned_chunk_addr, ((char*)chunk + size));
        return chunk;
@@ -3233,7 +4035,7 @@ alloc_from_freelist (size_t size)
        slot = slot_for_size (size);
        /*g_print ("using slot %d for size %d (slot size: %d)\n", slot, size, freelist_sizes [slot]);*/
        g_assert (size <= freelist_sizes [slot]);
-       for (pchunk = pinned_chunk_list; pchunk; pchunk = pchunk->next) {
+       for (pchunk = pinned_chunk_list; pchunk; pchunk = pchunk->block.next) {
                void **p = pchunk->free_list [slot];
                if (p) {
                        /*g_print ("found freelist for slot %d in chunk %p, returning %p, next %p\n", slot, pchunk, p, *p);*/
@@ -3241,14 +4043,14 @@ alloc_from_freelist (size_t size)
                        return p;
                }
        }
-       for (pchunk = pinned_chunk_list; pchunk; pchunk = pchunk->next) {
+       for (pchunk = pinned_chunk_list; pchunk; pchunk = pchunk->block.next) {
                res = get_chunk_freelist (pchunk, slot);
                if (res)
                        return res;
        }
-       pchunk = alloc_pinned_chunk (size);
+       pchunk = alloc_pinned_chunk ();
        /* FIXME: handle OOM */
-       pchunk->next = pinned_chunk_list;
+       pchunk->block.next = pinned_chunk_list;
        pinned_chunk_list = pchunk;
        res = get_chunk_freelist (pchunk, slot);
        return res;
@@ -3259,43 +4061,62 @@ alloc_from_freelist (size_t size)
  * in the chunk.
  */
 static void*
-get_internal_mem (size_t size)
+get_internal_mem (size_t size, int type)
 {
-       return calloc (1, size);
-#if 0
        int slot;
        void *res = NULL;
        PinnedChunk *pchunk;
+
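+       /* Requests too large for the biggest freelist slot bypass the
+        * internal chunks: they get their own OS allocation with a small
+        * header so free_internal_mem () can recognize them later. */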
+       if (size > freelist_sizes [FREELIST_NUM_SLOTS - 1]) {
+               LargeInternalMemHeader *mh;
+
+               size += sizeof (LargeInternalMemHeader);
+               mh = get_os_memory (size, TRUE);
+               mh->magic = LARGE_INTERNAL_MEM_HEADER_MAGIC;
+               mh->size = size;
+
+               large_internal_bytes_alloced += size;
+
+               return mh->data;
+       }
+
        slot = slot_for_size (size);
        g_assert (size <= freelist_sizes [slot]);
-       for (pchunk = internal_chunk_list; pchunk; pchunk = pchunk->next) {
+
+       small_internal_mem_bytes [type] += freelist_sizes [slot];
+
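+       /* Small requests are served from the internal pinned chunks; the
+        * returned block is zeroed to keep the calloc semantics of the old
+        * implementation. */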
+       for (pchunk = internal_chunk_list; pchunk; pchunk = pchunk->block.next) {
                void **p = pchunk->free_list [slot];
                if (p) {
                        pchunk->free_list [slot] = *p;
+                       memset (p, 0, size);
                        return p;
                }
        }
-       for (pchunk = internal_chunk_list; pchunk; pchunk = pchunk->next) {
+       for (pchunk = internal_chunk_list; pchunk; pchunk = pchunk->block.next) {
                res = get_chunk_freelist (pchunk, slot);
-               if (res)
+               if (res) {
+                       memset (res, 0, size);
                        return res;
+               }
        }
-       pchunk = alloc_pinned_chunk (size);
+       pchunk = alloc_pinned_chunk ();
        /* FIXME: handle OOM */
-       pchunk->next = internal_chunk_list;
+       pchunk->block.next = internal_chunk_list;
        internal_chunk_list = pchunk;
        res = get_chunk_freelist (pchunk, slot);
+       memset (res, 0, size);
        return res;
-#endif
 }
 
 static void
-free_internal_mem (void *addr)
+free_internal_mem (void *addr, int type)
 {
-       free (addr);
-#if 0
        PinnedChunk *pchunk;
-       for (pchunk = internal_chunk_list; pchunk; pchunk = pchunk->next) {
+       LargeInternalMemHeader *mh;
+       if (!addr)
+               return;
+       for (pchunk = internal_chunk_list; pchunk; pchunk = pchunk->block.next) {
                /*printf ("trying to free %p in %p (pages: %d)\n", addr, pchunk, pchunk->num_pages);*/
                if (addr >= (void*)pchunk && (char*)addr < (char*)pchunk + pchunk->num_pages * FREELIST_PAGESIZE) {
                        int offset = (char*)addr - (char*)pchunk;
@@ -3304,12 +4125,16 @@ free_internal_mem (void *addr)
                        void **p = addr;
                        *p = pchunk->free_list [slot];
                        pchunk->free_list [slot] = p;
+
+                       small_internal_mem_bytes [type] -= freelist_sizes [slot];
+
                        return;
                }
        }
-       printf ("free of %p failed\n", addr);
-       g_assert_not_reached ();
-#endif
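+       /* Not found in any internal chunk, so this must be a large internal
+        * allocation: recover its header (checked via the magic value) and
+        * return the memory to the OS. */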
+       mh = (LargeInternalMemHeader*)((char*)addr - G_STRUCT_OFFSET (LargeInternalMemHeader, data));
+       g_assert (mh->magic == LARGE_INTERNAL_MEM_HEADER_MAGIC);
+       large_internal_bytes_alloced -= mh->size;
+       free_os_memory (mh, mh->size);
 }
 
 /*
@@ -3353,11 +4178,13 @@ alloc_large_inner (MonoVTable *vtable, size_t size)
        size_t alloc_size;
        int just_did_major_gc = FALSE;
 
+       g_assert (size > MAX_SMALL_OBJ_SIZE);
+
        if (los_memory_usage > next_los_collection) {
                DEBUG (4, fprintf (gc_debug_file, "Should trigger major collection: req size %zd (los already: %zu, limit: %zu)\n", size, los_memory_usage, next_los_collection));
                just_did_major_gc = TRUE;
                stop_world ();
-               major_collection ();
+               major_collection ("LOS overflow");
                restart_world ();
                /* later increase based on a percent of the heap size */
                next_los_collection = los_memory_usage + 5*1024*1024;
@@ -3424,14 +4251,16 @@ alloc_degraded (MonoVTable *vtable, size_t size)
 {
        GCMemSection *section;
        void **p = NULL;
-       for (section = section_list; section; section = section->next) {
+       g_assert (size <= MAX_SMALL_OBJ_SIZE);
+       for (section = section_list; section; section = section->block.next) {
                if (section != nursery_section && (section->end_data - section->next_data) >= size) {
                        p = (void**)section->next_data;
                        break;
                }
        }
        if (!p) {
-               section = alloc_section (nursery_section->size * 4);
+               section = alloc_major_section ();
+               section->is_to_space = FALSE;
                /* FIXME: handle OOM */
                p = (void**)section->next_data;
        }
@@ -3450,26 +4279,24 @@ alloc_degraded (MonoVTable *vtable, size_t size)
  * so when we scan the thread stacks for pinned objects, we can start
  * a search for the pinned object in SCAN_START_SIZE chunks.
  */
-void*
-mono_gc_alloc_obj (MonoVTable *vtable, size_t size)
+static void*
+mono_gc_alloc_obj_nolock (MonoVTable *vtable, size_t size)
 {
        /* FIXME: handle OOM */
        void **p;
        char *new_next;
-       int dummy;
        gboolean res;
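+       /* The TLAB_* macros (set up by TLAB_ACCESS_INIT) replace the direct
+        * accesses to the thread-local tlab_* variables used by the old code. */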
+       TLAB_ACCESS_INIT;
+
+       HEAVY_STAT (++stat_objects_alloced);
+
        size += ALLOC_ALIGN - 1;
        size &= ~(ALLOC_ALIGN - 1);
 
        g_assert (vtable->gc_descr);
 
        if (G_UNLIKELY (collect_before_allocs)) {
-               int dummy;
-
                if (nursery_section) {
-                       LOCK_GC;
-
-                       update_current_thread_stack (&dummy);
                        stop_world ();
                        collect_nursery (0);
                        restart_world ();
@@ -3477,50 +4304,60 @@ mono_gc_alloc_obj (MonoVTable *vtable, size_t size)
                                // FIXME:
                                g_assert_not_reached ();
                        }
-                       UNLOCK_GC;
                }
        }
 
-       /* tlab_next and tlab_temp_end are TLS vars so accessing them might be expensive */
+       /*
+        * We must already have the lock here instead of after the
+        * fast path because we might be interrupted in the fast path
+        * (after confirming that new_next < TLAB_TEMP_END) by the GC,
+        * and we'll end up allocating an object in a fragment which
+        * no longer belongs to us.
+        *
+        * The managed allocator does not do this, but it's treated
+        * specially by the world-stopping code.
+        */
 
-       p = (void**)tlab_next;
-       /* FIXME: handle overflow */
-       new_next = (char*)p + size;
-       tlab_next = new_next;
+       if (size > MAX_SMALL_OBJ_SIZE) {
+               p = alloc_large_inner (vtable, size);
+       } else {
+               /* tlab_next and tlab_temp_end are TLS vars so accessing them might be expensive */
 
-       if (G_LIKELY (new_next < tlab_temp_end)) {
-               /* Fast path */
+               p = (void**)TLAB_NEXT;
+               /* FIXME: handle overflow */
+               new_next = (char*)p + size;
+               TLAB_NEXT = new_next;
 
-               /* 
-                * FIXME: We might need a memory barrier here so the change to tlab_next is 
-                * visible before the vtable store.
-                */
+               if (G_LIKELY (new_next < TLAB_TEMP_END)) {
+                       /* Fast path */
 
-               DEBUG (6, fprintf (gc_debug_file, "Allocated object %p, vtable: %p (%s), size: %zd\n", p, vtable, vtable->klass->name, size));
-               *p = vtable;
-               
-               return p;
-       }
+                       /* 
+                        * FIXME: We might need a memory barrier here so the change to tlab_next is 
+                        * visible before the vtable store.
+                        */
 
-       /* Slow path */
+                       DEBUG (6, fprintf (gc_debug_file, "Allocated object %p, vtable: %p (%s), size: %zd\n", p, vtable, vtable->klass->name, size));
+                       g_assert (*p == NULL);
+                       *p = vtable;
 
-       /* there are two cases: the object is too big or we run out of space in the TLAB */
-       /* we also reach here when the thread does its first allocation after a minor 
-        * collection, since the tlab_ variables are initialized to NULL.
-        * there can be another case (from ORP), if we cooperate with the runtime a bit:
-        * objects that need finalizers can have the high bit set in their size
-        * so the above check fails and we can readily add the object to the queue.
-        * This avoids taking again the GC lock when registering, but this is moot when
-        * doing thread-local allocation, so it may not be a good idea.
-        */
-       LOCK_GC;
-       if (size > MAX_SMALL_OBJ_SIZE) {
-               /* get ready for possible collection */
-               update_current_thread_stack (&dummy);
-               tlab_next -= size;
-               p = alloc_large_inner (vtable, size);
-       } else {
-               if (tlab_next >= tlab_real_end) {
+                       g_assert (TLAB_NEXT == new_next);
+
+                       return p;
+               }
+
+               /* Slow path */
+
+               /* We get here when we run out of space in the TLAB (the
+                * too-large case was already handled before the TLAB path). */
+               /* we also reach here when the thread does its first allocation after a minor
+                * collection, since the tlab_ variables are initialized to NULL.
+                * there can be another case (from ORP), if we cooperate with the runtime a bit:
+                * objects that need finalizers can have the high bit set in their size
+                * so the above check fails and we can readily add the object to the queue.
+                * This avoids taking the GC lock again when registering, but this is moot when
+                * doing thread-local allocation, so it may not be a good idea.
+                */
+               g_assert (TLAB_NEXT == new_next);
+               if (TLAB_NEXT >= TLAB_REAL_END) {
                        /* 
                         * Run out of space in the TLAB. When this happens, some amount of space
                         * remains in the TLAB, but not enough to satisfy the current allocation
@@ -3528,13 +4365,12 @@ mono_gc_alloc_obj (MonoVTable *vtable, size_t size)
                         * keep it if the remaining space is above a treshold, and satisfy the
                         * allocation directly from the nursery.
                         */
-                       tlab_next -= size;
+                       TLAB_NEXT -= size;
                        /* when running in degraded mode, we continue allocing that way
                         * for a while, to decrease the number of useless nursery collections.
                         */
                        if (degraded_mode && degraded_mode < DEFAULT_NURSERY_SIZE) {
                                p = alloc_degraded (vtable, size);
-                               UNLOCK_GC;
                                return p;
                        }
 
@@ -3542,12 +4378,9 @@ mono_gc_alloc_obj (MonoVTable *vtable, size_t size)
                                /* Allocate directly from the nursery */
                                if (nursery_next + size >= nursery_frag_real_end) {
                                        if (!search_fragment_for_size (size)) {
-                                               /* get ready for possible collection */
-                                               update_current_thread_stack (&dummy);
                                                minor_collect_or_expand_inner (size);
                                                if (degraded_mode) {
                                                        p = alloc_degraded (vtable, size);
-                                                       UNLOCK_GC;
                                                        return p;
                                                }
                                        }
@@ -3563,37 +4396,34 @@ mono_gc_alloc_obj (MonoVTable *vtable, size_t size)
                                if (nursery_clear_policy == CLEAR_AT_TLAB_CREATION)
                                        memset (p, 0, size);
                        } else {
-                               if (tlab_start)
-                                       DEBUG (3, fprintf (gc_debug_file, "Retire TLAB: %p-%p [%ld]\n", tlab_start, tlab_real_end, (long)(tlab_real_end - tlab_next - size)));
+                               if (TLAB_START)
+                                       DEBUG (3, fprintf (gc_debug_file, "Retire TLAB: %p-%p [%ld]\n", TLAB_START, TLAB_REAL_END, (long)(TLAB_REAL_END - TLAB_NEXT - size)));
 
                                if (nursery_next + tlab_size >= nursery_frag_real_end) {
                                        res = search_fragment_for_size (tlab_size);
                                        if (!res) {
-                                               /* get ready for possible collection */
-                                               update_current_thread_stack (&dummy);
                                                minor_collect_or_expand_inner (tlab_size);
                                                if (degraded_mode) {
                                                        p = alloc_degraded (vtable, size);
-                                                       UNLOCK_GC;
                                                        return p;
                                                }
                                        }
                                }
 
                                /* Allocate a new TLAB from the current nursery fragment */
-                               tlab_start = nursery_next;
+                               TLAB_START = nursery_next;
                                nursery_next += tlab_size;
-                               tlab_next = tlab_start;
-                               tlab_real_end = tlab_start + tlab_size;
-                               tlab_temp_end = tlab_start + MIN (SCAN_START_SIZE, tlab_size);
+                               TLAB_NEXT = TLAB_START;
+                               TLAB_REAL_END = TLAB_START + tlab_size;
+                               TLAB_TEMP_END = TLAB_START + MIN (SCAN_START_SIZE, tlab_size);
 
                                if (nursery_clear_policy == CLEAR_AT_TLAB_CREATION)
-                                       memset (tlab_start, 0, tlab_size);
+                                       memset (TLAB_START, 0, tlab_size);
 
                                /* Allocate from the TLAB */
-                               p = (void*)tlab_next;
-                               tlab_next += size;
-                               g_assert (tlab_next <= tlab_real_end);
+                               p = (void*)TLAB_NEXT;
+                               TLAB_NEXT += size;
+                               g_assert (TLAB_NEXT <= TLAB_REAL_END);
 
                                nursery_section->scan_starts [((char*)p - (char*)nursery_section->data)/SCAN_START_SIZE] = (char*)p;
                        }
@@ -3603,17 +4433,74 @@ mono_gc_alloc_obj (MonoVTable *vtable, size_t size)
                        /* record the scan start so we can find pinned objects more easily */
                        nursery_section->scan_starts [((char*)p - (char*)nursery_section->data)/SCAN_START_SIZE] = (char*)p;
                        /* we just bump tlab_temp_end as well */
-                       tlab_temp_end = MIN (tlab_real_end, tlab_next + SCAN_START_SIZE);
-                       DEBUG (5, fprintf (gc_debug_file, "Expanding local alloc: %p-%p\n", tlab_next, tlab_temp_end));
+                       TLAB_TEMP_END = MIN (TLAB_REAL_END, TLAB_NEXT + SCAN_START_SIZE);
+                       DEBUG (5, fprintf (gc_debug_file, "Expanding local alloc: %p-%p\n", TLAB_NEXT, TLAB_TEMP_END));
                }
        }
 
        DEBUG (6, fprintf (gc_debug_file, "Allocated object %p, vtable: %p (%s), size: %zd\n", p, vtable, vtable->klass->name, size));
        *p = vtable;
 
+       return p;
+}
+
+void*
+mono_gc_alloc_obj (MonoVTable *vtable, size_t size)
+{
+       void *res;
+       LOCK_GC;
+       res = mono_gc_alloc_obj_nolock (vtable, size);
+       UNLOCK_GC;
+       return res;
+}
+
+void*
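+/*
+ * Specialized allocators for vectors, arrays and strings.  They fill in the
+ * length/bounds fields while still holding the GC lock, so a collection can
+ * never see a freshly allocated object with an uninitialized size.
+ */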
+mono_gc_alloc_vector (MonoVTable *vtable, size_t size, mono_array_size_t max_length)
+{
+       MonoArray *arr;
+
+       LOCK_GC;
+
+       arr = mono_gc_alloc_obj_nolock (vtable, size);
+       arr->max_length = max_length;
+
        UNLOCK_GC;
 
-       return p;
+       return arr;
+}
+
+void*
+mono_gc_alloc_array (MonoVTable *vtable, size_t size, mono_array_size_t max_length, mono_array_size_t bounds_size)
+{
+       MonoArray *arr;
+       MonoArrayBounds *bounds;
+
+       LOCK_GC;
+
+       arr = mono_gc_alloc_obj_nolock (vtable, size);
+       arr->max_length = max_length;
+
+       bounds = (MonoArrayBounds*)((char*)arr + size - bounds_size);
+       arr->bounds = bounds;
+
+       UNLOCK_GC;
+
+       return arr;
+}
+
+void*
+mono_gc_alloc_string (MonoVTable *vtable, size_t size, gint32 len)
+{
+       MonoString *str;
+
+       LOCK_GC;
+
+       str = mono_gc_alloc_obj_nolock (vtable, size);
+       str->length = len;
+
+       UNLOCK_GC;
+
+       return str;
 }
 
 /*
@@ -3629,7 +4516,6 @@ mono_gc_alloc_pinned_obj (MonoVTable *vtable, size_t size)
        size &= ~(ALLOC_ALIGN - 1);
        LOCK_GC;
        if (size > MAX_FREELIST_SIZE) {
-               update_current_thread_stack (&p);
                /* large objects are always pinned anyway */
                p = alloc_large_inner (vtable, size);
        } else {
@@ -3681,18 +4567,59 @@ queue_finalization_entry (FinalizeEntry *entry) {
        }
 }
 
+/* LOCKING: requires that the GC lock is held */
+static void
+rehash_fin_table (FinalizeEntryHashTable *hash_table)
+{
+       FinalizeEntry **finalizable_hash = hash_table->table;
+       mword finalizable_hash_size = hash_table->size;
+       int i;
+       unsigned int hash;
+       FinalizeEntry **new_hash;
+       FinalizeEntry *entry, *next;
+       int new_size = g_spaced_primes_closest (hash_table->num_registered);
+
+       new_hash = get_internal_mem (new_size * sizeof (FinalizeEntry*), INTERNAL_MEM_FIN_TABLE);
+       for (i = 0; i < finalizable_hash_size; ++i) {
+               for (entry = finalizable_hash [i]; entry; entry = next) {
+                       hash = mono_object_hash (entry->object) % new_size;
+                       next = entry->next;
+                       entry->next = new_hash [hash];
+                       new_hash [hash] = entry;
+               }
+       }
+       free_internal_mem (finalizable_hash, INTERNAL_MEM_FIN_TABLE);
+       hash_table->table = new_hash;
+       hash_table->size = new_size;
+}
+
+/* LOCKING: requires that the GC lock is held */
+static void
+rehash_fin_table_if_necessary (FinalizeEntryHashTable *hash_table)
+{
+       if (hash_table->num_registered >= hash_table->size * 2)
+               rehash_fin_table (hash_table);
+}
+
+/* LOCKING: requires that the GC lock is held */
 static void
-finalize_in_range (char *start, char *end)
+finalize_in_range (char *start, char *end, int generation)
 {
+       FinalizeEntryHashTable *hash_table = get_finalize_entry_hash_table (generation);
        FinalizeEntry *entry, *prev;
        int i;
+       FinalizeEntry **finalizable_hash = hash_table->table;
+       mword finalizable_hash_size = hash_table->size;
+
        if (no_finalize)
                return;
        for (i = 0; i < finalizable_hash_size; ++i) {
                prev = NULL;
                for (entry = finalizable_hash [i]; entry;) {
-                       if ((char*)entry->object >= start && (char*)entry->object < end && ((char*)entry->object < to_space || (char*)entry->object >= to_space_end)) {
-                               if (object_is_fin_ready (entry->object)) {
+                       if ((char*)entry->object >= start && (char*)entry->object < end && !object_is_in_to_space (entry->object)) {
+                               gboolean is_fin_ready = object_is_fin_ready (entry->object);
+                               char *copy = copy_object (entry->object, start, end);
+                               if (is_fin_ready) {
                                        char *from;
                                        FinalizeEntry *next;
                                        /* remove and put in fin_ready_list */
@@ -3702,18 +4629,45 @@ finalize_in_range (char *start, char *end)
                                                finalizable_hash [i] = entry->next;
                                        next = entry->next;
                                        num_ready_finalizers++;
-                                       num_registered_finalizers--;
+                                       hash_table->num_registered--;
                                        queue_finalization_entry (entry);
                                        /* Make it survive */
                                        from = entry->object;
-                                       entry->object = copy_object (entry->object, start, end);
-                                       DEBUG (5, fprintf (gc_debug_file, "Queueing object for finalization: %p (%s) (was at %p) (%d/%d)\n", entry->object, safe_name (entry->object), from, num_ready_finalizers, num_registered_finalizers));
+                                       entry->object = copy;
+                                       DEBUG (5, fprintf (gc_debug_file, "Queueing object for finalization: %p (%s) (was at %p) (%d/%d)\n", entry->object, safe_name (entry->object), from, num_ready_finalizers, hash_table->num_registered));
                                        entry = next;
                                        continue;
                                } else {
-                                       /* update pointer */
-                                       DEBUG (5, fprintf (gc_debug_file, "Updating object for finalization: %p (%s)\n", entry->object, safe_name (entry->object)));
-                                       entry->object = copy_object (entry->object, start, end);
+                                       char *from = entry->object;
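+                                       /* If the object survived but was promoted out of the
+                                        * nursery, move its finalize entry from the minor
+                                        * table to the major one. */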
+                                       if (hash_table == &minor_finalizable_hash && !ptr_in_nursery (copy)) {
+                                               FinalizeEntry *next = entry->next;
+                                               unsigned int major_hash;
+                                               /* remove from the list */
+                                               if (prev)
+                                                       prev->next = entry->next;
+                                               else
+                                                       finalizable_hash [i] = entry->next;
+                                               hash_table->num_registered--;
+
+                                               entry->object = copy;
+
+                                               /* insert it into the major hash */
+                                               rehash_fin_table_if_necessary (&major_finalizable_hash);
+                                               major_hash = mono_object_hash ((MonoObject*) copy) %
+                                                       major_finalizable_hash.size;
+                                               entry->next = major_finalizable_hash.table [major_hash];
+                                               major_finalizable_hash.table [major_hash] = entry;
+                                               major_finalizable_hash.num_registered++;
+
+                                               DEBUG (5, fprintf (gc_debug_file, "Promoting finalization of object %p (%s) (was at %p) to major table\n", copy, safe_name (copy), from));
+
+                                               entry = next;
+                                               continue;
+                                       } else {
+                                               /* update pointer */
+                                               DEBUG (5, fprintf (gc_debug_file, "Updating object for finalization: %p (%s) (was at %p)\n", entry->object, safe_name (entry->object), from));
+                                               entry->object = copy;
+                                       }
                                }
                        }
                        prev = entry;
@@ -3722,17 +4676,24 @@ finalize_in_range (char *start, char *end)
        }
 }
 
+/* LOCKING: requires that the GC lock is held */
 static void
-null_link_in_range (char *start, char *end)
+null_link_in_range (char *start, char *end, int generation)
 {
+       DisappearingLinkHashTable *hash = get_dislink_hash_table (generation);
+       DisappearingLink **disappearing_link_hash = hash->table;
+       int disappearing_link_hash_size = hash->size;
        DisappearingLink *entry, *prev;
        int i;
+       if (!hash->num_links)
+               return;
        for (i = 0; i < disappearing_link_hash_size; ++i) {
                prev = NULL;
                for (entry = disappearing_link_hash [i]; entry;) {
                        char *object = DISLINK_OBJECT (entry);
-                       if (object >= start && object < end && (object < to_space || object >= to_space_end)) {
-                               if (!DISLINK_TRACK (entry) && object_is_fin_ready (object)) {
+                       if (object >= start && object < end && !object_is_in_to_space (object)) {
+                               gboolean track = DISLINK_TRACK (entry);
+                               if (!track && object_is_fin_ready (object)) {
                                        void **p = entry->link;
                                        DisappearingLink *old;
                                        *p = NULL;
@@ -3743,23 +4704,51 @@ null_link_in_range (char *start, char *end)
                                                disappearing_link_hash [i] = entry->next;
                                        DEBUG (5, fprintf (gc_debug_file, "Dislink nullified at %p to GCed object %p\n", p, object));
                                        old = entry->next;
-                                       free_internal_mem (entry);
+                                       free_internal_mem (entry, INTERNAL_MEM_DISLINK);
                                        entry = old;
-                                       num_disappearing_links--;
+                                       hash->num_links--;
                                        continue;
                                } else {
-                                       /* update pointer if it's moved
+                                       char *copy = copy_object (object, start, end);
+
+                                       /* Update pointer if it's moved.  If the object
+                                        * has been moved out of the nursery, we need to
+                                        * move the link from the minor hash table to
+                                        * the major one.
+                                        *
                                         * FIXME: what if an object is moved earlier?
                                         */
-                                       /* We set the track
-                                        * resurrection bit to FALSE
-                                        * here so that the object can
-                                        * be collected in the next
-                                        * cycle (i.e. after it was
-                                        * finalized).
-                                        */
-                                       *entry->link = HIDE_POINTER (copy_object (object, start, end), FALSE);
-                                       DEBUG (5, fprintf (gc_debug_file, "Updated dislink at %p to %p\n", entry->link, DISLINK_OBJECT (entry)));
+
+                                       if (hash == &minor_disappearing_link_hash && !ptr_in_nursery (copy)) {
+                                               void **link = entry->link;
+                                               DisappearingLink *old;
+                                               /* remove from list */
+                                               if (prev)
+                                                       prev->next = entry->next;
+                                               else
+                                                       disappearing_link_hash [i] = entry->next;
+                                               old = entry->next;
+                                               free_internal_mem (entry, INTERNAL_MEM_DISLINK);
+                                               entry = old;
+                                               hash->num_links--;
+
+                                               add_or_remove_disappearing_link ((MonoObject*)copy, link,
+                                                       track, GENERATION_OLD);
+
+                                               DEBUG (5, fprintf (gc_debug_file, "Upgraded dislink at %p to major because object %p moved to %p\n", link, object, copy));
+
+                                               continue;
+                                       } else {
+                                               /* We set the track resurrection bit to
+                                                * FALSE if the object is to be finalized
+                                                * so that the object can be collected in
+                                                * the next cycle (i.e. after it was
+                                                * finalized).
+                                                */
+                                               *entry->link = HIDE_POINTER (copy,
+                                                       object_is_fin_ready (object) ? FALSE : track);
+                                               DEBUG (5, fprintf (gc_debug_file, "Updated dislink at %p to %p\n", entry->link, DISLINK_OBJECT (entry)));
+                                       }
                                }
                        }
                        prev = entry;
@@ -3768,16 +4757,24 @@ null_link_in_range (char *start, char *end)
        }
 }
 
+/* LOCKING: requires that the GC lock is held */
 static void
-null_links_for_domain (MonoDomain *domain)
+null_links_for_domain (MonoDomain *domain, int generation)
 {
+       DisappearingLinkHashTable *hash = get_dislink_hash_table (generation);
+       DisappearingLink **disappearing_link_hash = hash->table;
+       int disappearing_link_hash_size = hash->size;
        DisappearingLink *entry, *prev;
        int i;
        for (i = 0; i < disappearing_link_hash_size; ++i) {
                prev = NULL;
                for (entry = disappearing_link_hash [i]; entry; ) {
                        char *object = DISLINK_OBJECT (entry);
-                       if (object && mono_object_domain (object) == domain) {
+                       /* FIXME: actually there should be no object
+                          left in the domain with a non-null vtable
+                          (provided we remove the Thread special
+                          case) */
+                       if (object && (!((MonoObject*)object)->vtable || mono_object_domain (object) == domain)) {
                                DisappearingLink *next = entry->next;
 
                                if (prev)
@@ -3787,9 +4784,9 @@ null_links_for_domain (MonoDomain *domain)
 
                                if (*(entry->link)) {
                                        *(entry->link) = NULL;
-                                       g_warning ("Disappearing link not freed");
+                                       g_warning ("Disappearing link %p not freed", entry->link);
                                } else {
-                                       free_internal_mem (entry);
+                                       free_internal_mem (entry, INTERNAL_MEM_DISLINK);
                                }
 
                                entry = next;
@@ -3801,28 +4798,19 @@ null_links_for_domain (MonoDomain *domain)
        }
 }
 
-/**
- * mono_gc_finalizers_for_domain:
- * @domain: the unloading appdomain
- * @out_array: output array
- * @out_size: size of output array
- *
- * Store inside @out_array up to @out_size objects that belong to the unloading
- * appdomain @domain. Returns the number of stored items. Can be called repeteadly
- * until it returns 0.
- * The items are removed from the finalizer data structure, so the caller is supposed
- * to finalize them.
- * @out_array should be on the stack to allow the GC to know the objects are still alive.
- */
-int
-mono_gc_finalizers_for_domain (MonoDomain *domain, MonoObject **out_array, int out_size)
+/* LOCKING: requires that the GC lock is held */
+static int
+finalizers_for_domain (MonoDomain *domain, MonoObject **out_array, int out_size,
+       FinalizeEntryHashTable *hash_table)
 {
+       FinalizeEntry **finalizable_hash = hash_table->table;
+       mword finalizable_hash_size = hash_table->size;
        FinalizeEntry *entry, *prev;
        int i, count;
+
        if (no_finalize || !out_size || !out_array)
                return 0;
        count = 0;
-       LOCK_GC;
        for (i = 0; i < finalizable_hash_size; ++i) {
                prev = NULL;
                for (entry = finalizable_hash [i]; entry;) {
@@ -3834,50 +4822,56 @@ mono_gc_finalizers_for_domain (MonoDomain *domain, MonoObject **out_array, int o
                                else
                                        finalizable_hash [i] = entry->next;
                                next = entry->next;
-                               num_registered_finalizers--;
+                               hash_table->num_registered--;
                                out_array [count ++] = entry->object;
-                               DEBUG (5, fprintf (gc_debug_file, "Collecting object for finalization: %p (%s) (%d/%d)\n", entry->object, safe_name (entry->object), num_ready_finalizers, num_registered_finalizers));
+                               DEBUG (5, fprintf (gc_debug_file, "Collecting object for finalization: %p (%s) (%d/%d)\n", entry->object, safe_name (entry->object), num_ready_finalizers, hash_table->num_registered));
                                entry = next;
-                               if (count == out_size) {
-                                       UNLOCK_GC;
+                               if (count == out_size)
                                        return count;
-                               }
                                continue;
                        }
                        prev = entry;
                        entry = entry->next;
                }
        }
-       UNLOCK_GC;
        return count;
 }
 
-static void
-rehash_fin_table (void)
+/**
+ * mono_gc_finalizers_for_domain:
+ * @domain: the unloading appdomain
+ * @out_array: output array
+ * @out_size: size of output array
+ *
+ * Store inside @out_array up to @out_size objects that belong to the unloading
+ * appdomain @domain. Returns the number of stored items. Can be called repeatedly
+ * until it returns 0.
+ * The items are removed from the finalizer data structure, so the caller is supposed
+ * to finalize them.
+ * @out_array should be on the stack to allow the GC to know the objects are still alive.
+ */
+int
+mono_gc_finalizers_for_domain (MonoDomain *domain, MonoObject **out_array, int out_size)
 {
-       int i;
-       unsigned int hash;
-       FinalizeEntry **new_hash;
-       FinalizeEntry *entry, *next;
-       int new_size = g_spaced_primes_closest (num_registered_finalizers);
+       int result;
 
-       new_hash = get_internal_mem (new_size * sizeof (FinalizeEntry*));
-       for (i = 0; i < finalizable_hash_size; ++i) {
-               for (entry = finalizable_hash [i]; entry; entry = next) {
-                       hash = mono_object_hash (entry->object) % new_size;
-                       next = entry->next;
-                       entry->next = new_hash [hash];
-                       new_hash [hash] = entry;
-               }
+       LOCK_GC;
+       result = finalizers_for_domain (domain, out_array, out_size, &minor_finalizable_hash);
+       if (result < out_size) {
+               result += finalizers_for_domain (domain, out_array + result, out_size - result,
+                       &major_finalizable_hash);
        }
-       free_internal_mem (finalizable_hash);
-       finalizable_hash = new_hash;
-       finalizable_hash_size = new_size;
+       UNLOCK_GC;
+
+       return result;
 }
 
-void
-mono_gc_register_for_finalization (MonoObject *obj, void *user_data)
+static void
+register_for_finalization (MonoObject *obj, void *user_data, int generation)
 {
+       FinalizeEntryHashTable *hash_table = get_finalize_entry_hash_table (generation);
+       FinalizeEntry **finalizable_hash;
+       mword finalizable_hash_size;
        FinalizeEntry *entry, *prev;
        unsigned int hash;
        if (no_finalize)
@@ -3885,8 +4879,9 @@ mono_gc_register_for_finalization (MonoObject *obj, void *user_data)
        g_assert (user_data == NULL || user_data == mono_gc_run_finalize);
        hash = mono_object_hash (obj);
        LOCK_GC;
-       if (num_registered_finalizers >= finalizable_hash_size * 2)
-               rehash_fin_table ();
+       rehash_fin_table_if_necessary (hash_table);
+       finalizable_hash = hash_table->table;
+       finalizable_hash_size = hash_table->size;
        hash %= finalizable_hash_size;
        prev = NULL;
        for (entry = finalizable_hash [hash]; entry; entry = entry->next) {
@@ -3897,9 +4892,9 @@ mono_gc_register_for_finalization (MonoObject *obj, void *user_data)
                                        prev->next = entry->next;
                                else
                                        finalizable_hash [hash] = entry->next;
-                               num_registered_finalizers--;
-                               DEBUG (5, fprintf (gc_debug_file, "Removed finalizer %p for object: %p (%s) (%d)\n", entry, obj, obj->vtable->klass->name, num_registered_finalizers));
-                               free_internal_mem (entry);
+                               hash_table->num_registered--;
+                               DEBUG (5, fprintf (gc_debug_file, "Removed finalizer %p for object: %p (%s) (%d)\n", entry, obj, obj->vtable->klass->name, hash_table->num_registered));
+                               free_internal_mem (entry, INTERNAL_MEM_FINALIZE_ENTRY);
                        }
                        UNLOCK_GC;
                        return;
@@ -3911,25 +4906,36 @@ mono_gc_register_for_finalization (MonoObject *obj, void *user_data)
                UNLOCK_GC;
                return;
        }
-       entry = get_internal_mem (sizeof (FinalizeEntry));
+       entry = get_internal_mem (sizeof (FinalizeEntry), INTERNAL_MEM_FINALIZE_ENTRY);
        entry->object = obj;
        entry->next = finalizable_hash [hash];
        finalizable_hash [hash] = entry;
-       num_registered_finalizers++;
-       DEBUG (5, fprintf (gc_debug_file, "Added finalizer %p for object: %p (%s) (%d)\n", entry, obj, obj->vtable->klass->name, num_registered_finalizers));
+       hash_table->num_registered++;
+       DEBUG (5, fprintf (gc_debug_file, "Added finalizer %p for object: %p (%s) (%d) to %s table\n", entry, obj, obj->vtable->klass->name, hash_table->num_registered, generation_name (generation)));
        UNLOCK_GC;
 }
 
+void
+mono_gc_register_for_finalization (MonoObject *obj, void *user_data)
+{
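+       /* Register in the table of the generation the object currently lives
+        * in; finalize_in_range () moves the entry to the major table if the
+        * object is later promoted out of the nursery. */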
+       if (ptr_in_nursery (obj))
+               register_for_finalization (obj, user_data, GENERATION_NURSERY);
+       else
+               register_for_finalization (obj, user_data, GENERATION_OLD);
+}
+
 static void
-rehash_dislink (void)
+rehash_dislink (DisappearingLinkHashTable *hash_table)
 {
+       DisappearingLink **disappearing_link_hash = hash_table->table;
+       int disappearing_link_hash_size = hash_table->size;
        int i;
        unsigned int hash;
        DisappearingLink **new_hash;
        DisappearingLink *entry, *next;
-       int new_size = g_spaced_primes_closest (num_disappearing_links);
+       int new_size = g_spaced_primes_closest (hash_table->num_links);
 
-       new_hash = get_internal_mem (new_size * sizeof (DisappearingLink*));
+       new_hash = get_internal_mem (new_size * sizeof (DisappearingLink*), INTERNAL_MEM_DISLINK_TABLE);
        for (i = 0; i < disappearing_link_hash_size; ++i) {
                for (entry = disappearing_link_hash [i]; entry; entry = next) {
                        hash = mono_aligned_addr_hash (entry->link) % new_size;
@@ -3938,20 +4944,26 @@ rehash_dislink (void)
                        new_hash [hash] = entry;
                }
        }
-       free_internal_mem (disappearing_link_hash);
-       disappearing_link_hash = new_hash;
-       disappearing_link_hash_size = new_size;
+       free_internal_mem (disappearing_link_hash, INTERNAL_MEM_DISLINK_TABLE);
+       hash_table->table = new_hash;
+       hash_table->size = new_size;
 }
 
+/* LOCKING: assumes the GC lock is held */
 static void
-mono_gc_register_disappearing_link (MonoObject *obj, void **link, gboolean track)
+add_or_remove_disappearing_link (MonoObject *obj, void **link, gboolean track, int generation)
 {
+       DisappearingLinkHashTable *hash_table = get_dislink_hash_table (generation);
        DisappearingLink *entry, *prev;
        unsigned int hash;
-       LOCK_GC;
+       DisappearingLink **disappearing_link_hash = hash_table->table;
+       int disappearing_link_hash_size = hash_table->size;
 
-       if (num_disappearing_links >= disappearing_link_hash_size * 2)
-               rehash_dislink ();
+       if (hash_table->num_links >= disappearing_link_hash_size * 2) {
+               rehash_dislink (hash_table);
+               disappearing_link_hash = hash_table->table;
+               disappearing_link_hash_size = hash_table->size;
+       }
        /* FIXME: add check that link is not in the heap */
        hash = mono_aligned_addr_hash (link) % disappearing_link_hash_size;
        entry = disappearing_link_hash [hash];
@@ -3965,26 +4977,40 @@ mono_gc_register_disappearing_link (MonoObject *obj, void **link, gboolean track
                                        prev->next = entry->next;
                                else
                                        disappearing_link_hash [hash] = entry->next;
-                               num_disappearing_links--;
-                               DEBUG (5, fprintf (gc_debug_file, "Removed dislink %p (%d)\n", entry, num_disappearing_links));
-                               free_internal_mem (entry);
+                               hash_table->num_links--;
+                               DEBUG (5, fprintf (gc_debug_file, "Removed dislink %p (%d) from %s table\n", entry, hash_table->num_links, generation_name (generation)));
+                               free_internal_mem (entry, INTERNAL_MEM_DISLINK);
                                *link = NULL;
                        } else {
                                *link = HIDE_POINTER (obj, track); /* we allow the change of object */
                        }
-                       UNLOCK_GC;
                        return;
                }
                prev = entry;
        }
-       entry = get_internal_mem (sizeof (DisappearingLink));
+       if (obj == NULL)
+               return;
+       entry = get_internal_mem (sizeof (DisappearingLink), INTERNAL_MEM_DISLINK);
        *link = HIDE_POINTER (obj, track);
        entry->link = link;
        entry->next = disappearing_link_hash [hash];
        disappearing_link_hash [hash] = entry;
-       num_disappearing_links++;
-       DEBUG (5, fprintf (gc_debug_file, "Added dislink %p for object: %p (%s) at %p\n", entry, obj, obj->vtable->klass->name, link));
-       UNLOCK_GC;
+       hash_table->num_links++;
+       DEBUG (5, fprintf (gc_debug_file, "Added dislink %p for object: %p (%s) at %p to %s table\n", entry, obj, obj->vtable->klass->name, link, generation_name (generation)));
+}
+
+/* LOCKING: assumes the GC lock is held */
+static void
+mono_gc_register_disappearing_link (MonoObject *obj, void **link, gboolean track)
+{
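+       /* Passing a NULL object removes any existing registration of the
+        * link, so clear both generations first and then register the link
+        * in the table of the generation the object lives in. */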
+       add_or_remove_disappearing_link (NULL, link, FALSE, GENERATION_NURSERY);
+       add_or_remove_disappearing_link (NULL, link, FALSE, GENERATION_OLD);
+       if (obj) {
+               if (ptr_in_nursery (obj))
+                       add_or_remove_disappearing_link (obj, link, track, GENERATION_NURSERY);
+               else
+                       add_or_remove_disappearing_link (obj, link, track, GENERATION_OLD);
+       }
 }
 
 int
@@ -4012,7 +5038,7 @@ mono_gc_invoke_finalizers (void)
                                        e = e->next;
                                e->next = entry->next;
                        }
-                       free_internal_mem (entry);
+                       free_internal_mem (entry, INTERNAL_MEM_FINALIZE_ENTRY);
                        entry = NULL;
                }
 
@@ -4082,7 +5108,7 @@ rehash_roots (gboolean pinned)
        int new_size;
 
        new_size = g_spaced_primes_closest (num_roots_entries [pinned]);
-       new_hash = get_internal_mem (new_size * sizeof (RootRecord*));
+       new_hash = get_internal_mem (new_size * sizeof (RootRecord*), INTERNAL_MEM_ROOTS_TABLE);
        for (i = 0; i < roots_hash_size [pinned]; ++i) {
                for (entry = roots_hash [pinned][i]; entry; entry = next) {
                        hash = mono_aligned_addr_hash (entry->start_root) % new_size;
@@ -4091,7 +5117,7 @@ rehash_roots (gboolean pinned)
                        new_hash [hash] = entry;
                }
        }
-       free_internal_mem (roots_hash [pinned]);
+       free_internal_mem (roots_hash [pinned], INTERNAL_MEM_ROOTS_TABLE);
        roots_hash [pinned] = new_hash;
        roots_hash_size [pinned] = new_size;
 }
@@ -4141,7 +5167,7 @@ mono_gc_register_root_inner (char *start, size_t size, void *descr, int root_typ
                        return TRUE;
                }
        }
-       new_root = get_internal_mem (sizeof (RootRecord));
+       new_root = get_internal_mem (sizeof (RootRecord), INTERNAL_MEM_ROOT_RECORD);
        if (new_root) {
                new_root->start_root = start;
                new_root->end_root = new_root->start_root + size;
@@ -4193,7 +5219,7 @@ mono_gc_deregister_root (char* addr)
                                roots_size -= (tmp->end_root - tmp->start_root);
                                num_roots_entries [root_type]--;
                                DEBUG (3, fprintf (gc_debug_file, "Removed root %p for range: %p-%p\n", tmp, tmp->start_root, tmp->end_root));
-                               free_internal_mem (tmp);
+                               free_internal_mem (tmp, INTERNAL_MEM_ROOT_RECORD);
                                break;
                        }
                        prev = tmp;
@@ -4209,25 +5235,6 @@ mono_gc_deregister_root (char* addr)
  * ######################################################################
  */
 
-/* eventually share with MonoThread? */
-typedef struct _SgenThreadInfo SgenThreadInfo;
-
-struct _SgenThreadInfo {
-       SgenThreadInfo *next;
-       ARCH_THREAD_TYPE id;
-       unsigned int stop_count; /* to catch duplicate signals */
-       int signal;
-       int skip;
-       void *stack_end;
-       void *stack_start;
-       char **tlab_next_addr;
-       char **tlab_start_addr;
-       char **tlab_temp_end_addr;
-       char **tlab_real_end_addr;
-       RememberedSet *remset;
-       gpointer runtime_data;
-};
-
 /* FIXME: handle large/small config */
 #define THREAD_HASH_SIZE 11
 #define HASH_PTHREAD_T(id) (((unsigned int)(id) >> 4) * 2654435761u)
@@ -4236,9 +5243,14 @@ static SgenThreadInfo* thread_table [THREAD_HASH_SIZE];
 
 #if USE_SIGNAL_BASED_START_STOP_WORLD
 
-static sem_t suspend_ack_semaphore;
+static MonoSemType suspend_ack_semaphore;
+static MonoSemType *suspend_ack_semaphore_ptr;
 static unsigned int global_stop_count = 0;
+#ifdef __APPLE__
+static int suspend_signal_num = SIGXFSZ;
+#else
 static int suspend_signal_num = SIGPWR;
+#endif
 static int restart_signal_num = SIGXCPU;
 static sigset_t suspend_signal_mask;
 static mword cur_thread_regs [ARCH_NUM_REGS] = {0};
@@ -4263,7 +5275,9 @@ update_current_thread_stack (void *start)
        void *ptr = cur_thread_regs;
        SgenThreadInfo *info = thread_info_lookup (ARCH_GET_THREAD ());
        info->stack_start = align_pointer (&ptr);
+       g_assert (info->stack_start >= info->stack_start_limit && info->stack_start < info->stack_end);
        ARCH_STORE_REGS (ptr);
+       info->stopped_regs = ptr;
        if (gc_callbacks.thread_suspend_func)
                gc_callbacks.thread_suspend_func (info->runtime_data, NULL);
 }
@@ -4278,6 +5292,36 @@ signal_desc (int signum)
        return "unknown";
 }
 
+/*
+ * Define this and use the "xdomain-checks" MONO_GC_DEBUG option to
+ * have cross-domain checks in the write barrier.
+ */
+//#define XDOMAIN_CHECKS_IN_WBARRIER
+
+#ifndef HEAVY_STATISTICS
+#define MANAGED_ALLOCATION
+#ifndef XDOMAIN_CHECKS_IN_WBARRIER
+#define MANAGED_WBARRIER
+#endif
+#endif
+
+static gboolean
+is_ip_in_managed_allocator (MonoDomain *domain, gpointer ip);
+
+static void
+wait_for_suspend_ack (int count)
+{
+       int i, result;
+
+       for (i = 0; i < count; ++i) {
+               while ((result = MONO_SEM_WAIT (suspend_ack_semaphore_ptr)) != 0) {
+                       if (errno != EINTR) {
+                               g_error ("sem_wait ()");
+                       }
+               }
+       }
+}
+
 /* LOCKING: assumes the GC lock is held */
 static int
 thread_handshake (int signum)
@@ -4307,14 +5351,81 @@ thread_handshake (int signum)
                }
        }
 
-       for (i = 0; i < count; ++i) {
-               while ((result = sem_wait (&suspend_ack_semaphore)) != 0) {
-                       if (errno != EINTR) {
-                               g_error ("sem_wait ()");
+       wait_for_suspend_ack (count);
+
+       return count;
+}
+
+static int
+restart_threads_until_none_in_managed_allocator (void)
+{
+       SgenThreadInfo *info;
+       int i, result, num_threads_died = 0;
+       int sleep_duration = -1;
+
+       for (;;) {
+               int restart_count = 0, restarted_count = 0;
+               /* restart all threads that stopped in the
+                  allocator */
+               for (i = 0; i < THREAD_HASH_SIZE; ++i) {
+                       for (info = thread_table [i]; info; info = info->next) {
+                               if (info->skip)
+                                       continue;
+                               if (!info->stack_start ||
+                                               is_ip_in_managed_allocator (info->stopped_domain, info->stopped_ip)) {
+                                       result = pthread_kill (info->id, restart_signal_num);
+                                       if (result == 0) {
+                                               ++restart_count;
+                                       } else {
+                                               info->skip = 1;
+                                       }
+                               } else {
+                                       /* we set the stopped_ip to
+                                          NULL for threads which
+                                          we're not restarting so
+                                          that we can easily identify
+                                          the others */
+                                       info->stopped_ip = NULL;
+                                       info->stopped_domain = NULL;
+                               }
                        }
                }
+               /* if no threads were restarted, we're done */
+               if (restart_count == 0)
+                       break;
+
+               /* wait for the threads to signal their restart */
+               wait_for_suspend_ack (restart_count);
+
+               if (sleep_duration < 0) {
+                       sched_yield ();
+                       sleep_duration = 0;
+               } else {
+                       g_usleep (sleep_duration);
+                       sleep_duration += 10;
+               }
+
+               /* stop them again */
+               for (i = 0; i < THREAD_HASH_SIZE; ++i) {
+                       for (info = thread_table [i]; info; info = info->next) {
+                               if (info->skip || info->stopped_ip == NULL)
+                                       continue;
+                               result = pthread_kill (info->id, suspend_signal_num);
+                               if (result == 0) {
+                                       ++restarted_count;
+                               } else {
+                                       info->skip = 1;
+                               }
+                       }
+               }
+               /* some threads might have died */
+               num_threads_died += restart_count - restarted_count;
+               /* wait for the threads to signal their suspension
+                  again */
+               wait_for_suspend_ack (restart_count);
        }
-       return count;
+
+       return num_threads_died;
 }
 
 /* LOCKING: assumes the GC lock is held (by the stopping thread) */
@@ -4325,30 +5436,41 @@ suspend_handler (int sig, siginfo_t *siginfo, void *context)
        pthread_t id;
        int stop_count;
        int old_errno = errno;
+       gpointer regs [ARCH_NUM_REGS];
+       gpointer stack_start;
 
        id = pthread_self ();
        info = thread_info_lookup (id);
+       info->stopped_domain = mono_domain_get ();
+       info->stopped_ip = (gpointer) ARCH_SIGCTX_IP (context);
        stop_count = global_stop_count;
        /* duplicate signal */
        if (0 && info->stop_count == stop_count) {
                errno = old_errno;
                return;
        }
+#ifdef HAVE_KW_THREAD
        /* update the remset info in the thread data structure */
        info->remset = remembered_set;
-       /* 
-        * this includes the register values that the kernel put on the stack.
-        * Write arch-specific code to only push integer regs and a more accurate
-        * stack pointer.
-        */
-       info->stack_start = align_pointer (&id);
+#endif
+       stack_start = (char*) ARCH_SIGCTX_SP (context) - REDZONE_SIZE;
+       /* If stack_start is not within the limits, don't set it in
+          info; the thread will then be restarted and suspended again. */
+       if (stack_start >= info->stack_start_limit && stack_start < info->stack_end) {
+               info->stack_start = stack_start;
+
+               ARCH_COPY_SIGCTX_REGS (regs, context);
+               info->stopped_regs = regs;
+       } else {
+               g_assert (!info->stack_start);
+       }
 
        /* Notify the JIT */
        if (gc_callbacks.thread_suspend_func)
                gc_callbacks.thread_suspend_func (info->runtime_data, context);
 
        /* notify the waiting thread */
-       sem_post (&suspend_ack_semaphore);
+       MONO_SEM_POST (suspend_ack_semaphore_ptr);
        info->stop_count = stop_count;
 
        /* wait until we receive the restart signal */
@@ -4358,8 +5480,8 @@ suspend_handler (int sig, siginfo_t *siginfo, void *context)
        } while (info->signal != restart_signal_num);
 
        /* notify the waiting thread */
-       sem_post (&suspend_ack_semaphore);
-       
+       MONO_SEM_POST (suspend_ack_semaphore_ptr);
+
        errno = old_errno;
 }
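
/*
 * Not from this change: both MONO_SEM_POST calls above pair with
 * wait_for_suspend_ack () on the stopping thread -- the first acknowledges
 * the suspension (to thread_handshake () or to the re-suspend loop in
 * restart_threads_until_none_in_managed_allocator ()), the second
 * acknowledges the restart.
 */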
 
@@ -4384,10 +5506,14 @@ stop_world (void)
 {
        int count;
 
+       update_current_thread_stack (&count);
+
        global_stop_count++;
        DEBUG (3, fprintf (gc_debug_file, "stopping world n %d from %p %p\n", global_stop_count, thread_info_lookup (ARCH_GET_THREAD ()), (gpointer)ARCH_GET_THREAD ()));
        TV_GETTIME (stop_world_time);
        count = thread_handshake (suspend_signal_num);
+       count -= restart_threads_until_none_in_managed_allocator ();
+       g_assert (count >= 0);
        DEBUG (3, fprintf (gc_debug_file, "world stopped %d thread(s)\n", count));
        return count;
 }
@@ -4396,10 +5522,18 @@ stop_world (void)
 static int
 restart_world (void)
 {
-       int count;
+       int count, i;
+       SgenThreadInfo *info;
        TV_DECLARE (end_sw);
        unsigned long usec;
 
+       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
+               for (info = thread_table [i]; info; info = info->next) {
+                       info->stack_start = NULL;
+                       info->stopped_regs = NULL;
+               }
+       }
+
        count = thread_handshake (restart_signal_num);
        TV_GETTIME (end_sw);
        usec = TV_ELAPSED (stop_world_time, end_sw);
@@ -4422,7 +5556,7 @@ static void *scan_area_arg_start, *scan_area_arg_end;
 void
 mono_gc_conservatively_scan_area (void *start, void *end)
 {
-       conservatively_pin_objects_from (start, end, scan_area_arg_start, scan_area_arg_end);
+       conservatively_pin_objects_from (start, end, scan_area_arg_start, scan_area_arg_end, PIN_TYPE_STACK);
 }
 
 void*
@@ -4449,16 +5583,17 @@ scan_thread_data (void *start_nursery, void *end_nursery, gboolean precise)
                                DEBUG (2, fprintf (gc_debug_file, "Skipping dead thread %p, range: %p-%p, size: %zd\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start));
                                continue;
                        }
-                       DEBUG (2, fprintf (gc_debug_file, "Scanning thread %p, range: %p-%p, size: %zd, pinned=%d\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start, next_pin_slot));
-                       if (gc_callbacks.thread_mark_func)
+                       DEBUG (3, fprintf (gc_debug_file, "Scanning thread %p, range: %p-%p, size: %zd, pinned=%d\n", info, info->stack_start, info->stack_end, (char*)info->stack_end - (char*)info->stack_start, next_pin_slot));
+                       if (gc_callbacks.thread_mark_func && !conservative_stack_mark)
                                gc_callbacks.thread_mark_func (info->runtime_data, info->stack_start, info->stack_end, precise);
                        else if (!precise)
-                               conservatively_pin_objects_from (info->stack_start, info->stack_end, start_nursery, end_nursery);
+                               conservatively_pin_objects_from (info->stack_start, info->stack_end, start_nursery, end_nursery, PIN_TYPE_STACK);
+
+                       if (!precise)
+                               conservatively_pin_objects_from (info->stopped_regs, info->stopped_regs + ARCH_NUM_REGS,
+                                               start_nursery, end_nursery, PIN_TYPE_STACK);
                }
        }
-       DEBUG (2, fprintf (gc_debug_file, "Scanning current thread registers, pinned=%d\n", next_pin_slot));
-       if (!precise)
-               conservatively_pin_objects_from ((void*)cur_thread_regs, (void*)(cur_thread_regs + ARCH_NUM_REGS), start_nursery, end_nursery);
 }
 
 static void
@@ -4479,20 +5614,21 @@ find_pinning_ref_from_thread (char *obj, size_t size)
                                }
                                start++;
                        }
+
+                       /* FIXME: check info->stopped_regs */
                }
        }
-       /* FIXME: check register */
 }
 
-/* return TRUE if ptr points inside the managed heap */
 static gboolean
-ptr_in_heap (void* ptr)
+ptr_on_stack (void *ptr)
 {
-       mword p = (mword)ptr;
-       if (p < lowest_heap_address || p >= highest_heap_address)
-               return FALSE;
-       /* FIXME: more checks */
-       return TRUE;
+       gpointer stack_start = &stack_start;
+       SgenThreadInfo *info = thread_info_lookup (ARCH_GET_THREAD ());
+
+       if (ptr >= stack_start && ptr < (gpointer)info->stack_end)
+               return TRUE;
+       return FALSE;
 }
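
/*
 * Not from this change: taking the address of the local "stack_start"
 * above is a cheap approximation of the current stack pointer, so the
 * test covers everything between the caller's frame and the stack_end
 * recorded when the thread was registered.
 */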
 
 static mword*
@@ -4502,12 +5638,15 @@ handle_remset (mword *p, void *start_nursery, void *end_nursery, gboolean global
        mword count;
        mword desc;
 
+       if (global)
+               HEAVY_STAT (++stat_global_remsets_processed);
+
        /* FIXME: exclude stack locations */
        switch ((*p) & REMSET_TYPE_MASK) {
        case REMSET_LOCATION:
                ptr = (void**)(*p);
                //__builtin_prefetch (ptr);
-               if (((void*)ptr < start_nursery || (void*)ptr >= end_nursery) && ptr_in_heap (ptr)) {
+               if (((void*)ptr < start_nursery || (void*)ptr >= end_nursery)) {
                        *ptr = copy_object (*ptr, start_nursery, end_nursery);
                        DEBUG (9, fprintf (gc_debug_file, "Overwrote remset at %p with %p\n", ptr, *ptr));
                        if (!global && *ptr >= start_nursery && *ptr < end_nursery) {
@@ -4524,7 +5663,7 @@ handle_remset (mword *p, void *start_nursery, void *end_nursery, gboolean global
                return p + 1;
        case REMSET_RANGE:
                ptr = (void**)(*p & ~REMSET_TYPE_MASK);
-               if (((void*)ptr >= start_nursery && (void*)ptr < end_nursery) || !ptr_in_heap (ptr))
+               if (((void*)ptr >= start_nursery && (void*)ptr < end_nursery))
                        return p + 2;
                count = p [1];
                while (count-- > 0) {
@@ -4537,20 +5676,22 @@ handle_remset (mword *p, void *start_nursery, void *end_nursery, gboolean global
                return p + 2;
        case REMSET_OBJECT:
                ptr = (void**)(*p & ~REMSET_TYPE_MASK);
-               if (((void*)ptr >= start_nursery && (void*)ptr < end_nursery) || !ptr_in_heap (ptr))
+               if (((void*)ptr >= start_nursery && (void*)ptr < end_nursery))
                        return p + 1;
-               scan_object (*ptr, start_nursery, end_nursery);
+               scan_object ((char*)ptr, start_nursery, end_nursery);
                return p + 1;
        case REMSET_OTHER: {
                ptr = (void**)(*p & ~REMSET_TYPE_MASK);
 
                switch (p [1]) {
                case REMSET_VTYPE:
-                       if (((void*)ptr >= start_nursery && (void*)ptr < end_nursery) || !ptr_in_heap (ptr))
-                               return p + 3;
+                       if (((void*)ptr >= start_nursery && (void*)ptr < end_nursery))
+                               return p + 4;
                        desc = p [2];
-                       scan_vtype ((char*)ptr, desc, start_nursery, end_nursery);
-                       return p + 3;
+                       count = p [3];
+                       while (count-- > 0)
+                               ptr = (void**) scan_vtype ((char*)ptr, desc, start_nursery, end_nursery);
+                       return p + 4;
                case REMSET_ROOT_LOCATION:
                        /* Same as REMSET_LOCATION, but the address is not required to be in the heap */
                        *ptr = copy_object (*ptr, start_nursery, end_nursery);
@@ -4575,14 +5716,128 @@ handle_remset (mword *p, void *start_nursery, void *end_nursery, gboolean global
        return NULL;
 }
 
+#ifdef HEAVY_STATISTICS
+static mword*
+collect_store_remsets (RememberedSet *remset, mword *bumper)
+{
+       mword *p = remset->data;
+       mword last = 0;
+       mword last1 = 0;
+       mword last2 = 0;
+
+       while (p < remset->store_next) {
+               switch ((*p) & REMSET_TYPE_MASK) {
+               case REMSET_LOCATION:
+                       *bumper++ = *p;
+                       if (*p == last)
+                               ++stat_saved_remsets_1;
+                       last = *p;
+                       if (*p == last1 || *p == last2) {
+                               ++stat_saved_remsets_2;
+                       } else {
+                               last2 = last1;
+                               last1 = *p;
+                       }
+                       p += 1;
+                       break;
+               case REMSET_RANGE:
+                       p += 2;
+                       break;
+               case REMSET_OBJECT:
+                       p += 1;
+                       break;
+               case REMSET_OTHER:
+                       switch (p [1]) {
+                       case REMSET_VTYPE:
+                               p += 4;
+                               break;
+                       case REMSET_ROOT_LOCATION:
+                               p += 2;
+                               break;
+                       default:
+                               g_assert_not_reached ();
+                       }
+                       break;
+               default:
+                       g_assert_not_reached ();
+               }
+       }
+
+       return bumper;
+}
+
+static void
+remset_stats (void)
+{
+       RememberedSet *remset;
+       int size = 0;
+       SgenThreadInfo *info;
+       int i;
+       mword *addresses, *bumper, *p, *r;
+
+       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
+               for (info = thread_table [i]; info; info = info->next) {
+                       for (remset = info->remset; remset; remset = remset->next)
+                               size += remset->store_next - remset->data;
+               }
+       }
+       for (remset = freed_thread_remsets; remset; remset = remset->next)
+               size += remset->store_next - remset->data;
+       for (remset = global_remset; remset; remset = remset->next)
+               size += remset->store_next - remset->data;
+
+       bumper = addresses = get_internal_mem (sizeof (mword) * size, INTERNAL_MEM_STATISTICS);
+
+       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
+               for (info = thread_table [i]; info; info = info->next) {
+                       for (remset = info->remset; remset; remset = remset->next)
+                               bumper = collect_store_remsets (remset, bumper);
+               }
+       }
+       for (remset = global_remset; remset; remset = remset->next)
+               bumper = collect_store_remsets (remset, bumper);
+       for (remset = freed_thread_remsets; remset; remset = remset->next)
+               bumper = collect_store_remsets (remset, bumper);
+
+       g_assert (bumper <= addresses + size);
+
+       stat_store_remsets += bumper - addresses;
+
+       sort_addresses ((void**)addresses, bumper - addresses);
+       p = addresses;
+       r = addresses + 1;
+       while (r < bumper) {
+               if (*r != *p)
+                       *++p = *r;
+               ++r;
+       }
+
+       stat_store_remsets_unique += p - addresses;
+
+       free_internal_mem (addresses, INTERNAL_MEM_STATISTICS);
+}
+#endif
+
+static void
+clear_thread_store_remset_buffer (SgenThreadInfo *info)
+{
+       *info->store_remset_buffer_index_addr = 0;
+       memset (*info->store_remset_buffer_addr, 0, sizeof (gpointer) * STORE_REMSET_BUFFER_SIZE);
+}
+
 static void
 scan_from_remsets (void *start_nursery, void *end_nursery)
 {
        int i;
        SgenThreadInfo *info;
-       RememberedSet *remset, *next;
+       RememberedSet *remset;
+       GenericStoreRememberedSet *store_remset;
        mword *p, *next_p, *store_pos;
 
+#ifdef HEAVY_STATISTICS
+       remset_stats ();
+#endif
+
        /* the global one */
        for (remset = global_remset; remset; remset = remset->next) {
                DEBUG (4, fprintf (gc_debug_file, "Scanning global remset range: %p-%p, size: %zd\n", remset->data, remset->store_next, remset->store_next - remset->data));
@@ -4615,9 +5870,28 @@ scan_from_remsets (void *start_nursery, void *end_nursery)
                remset->store_next = store_pos;
        }
 
+       /* the generic store ones */
+       store_remset = generic_store_remsets;
+       while (store_remset) {
+               GenericStoreRememberedSet *next = store_remset->next;
+
+               for (i = 0; i < STORE_REMSET_BUFFER_SIZE - 1; ++i) {
+                       gpointer addr = store_remset->data [i];
+                       if (addr)
+                               handle_remset ((mword*)&addr, start_nursery, end_nursery, FALSE);
+               }
+
+               free_internal_mem (store_remset, INTERNAL_MEM_STORE_REMSET);
+
+               store_remset = next;
+       }
+       generic_store_remsets = NULL;
+
        /* the per-thread ones */
        for (i = 0; i < THREAD_HASH_SIZE; ++i) {
                for (info = thread_table [i]; info; info = info->next) {
+                       RememberedSet *next;
+                       int j;
                        for (remset = info->remset; remset; remset = next) {
                                DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %zd\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
                                for (p = remset->data; p < remset->store_next;) {
@@ -4628,11 +5902,28 @@ scan_from_remsets (void *start_nursery, void *end_nursery)
                                remset->next = NULL;
                                if (remset != info->remset) {
                                        DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
-                                       free_internal_mem (remset);
+                                       free_internal_mem (remset, INTERNAL_MEM_REMSET);
                                }
                        }
+                       for (j = 0; j < *info->store_remset_buffer_index_addr; ++j)
+                               handle_remset ((mword*)*info->store_remset_buffer_addr + j + 1, start_nursery, end_nursery, FALSE);
+                       clear_thread_store_remset_buffer (info);
                }
        }
+
+       /* the freed thread ones */
+       while (freed_thread_remsets) {
+               RememberedSet *next;
+               remset = freed_thread_remsets;
+               DEBUG (4, fprintf (gc_debug_file, "Scanning remset for freed thread, range: %p-%p, size: %zd\n", remset->data, remset->store_next, remset->store_next - remset->data));
+               for (p = remset->data; p < remset->store_next;) {
+                       p = handle_remset (p, start_nursery, end_nursery, FALSE);
+               }
+               next = remset->next;
+               DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
+               free_internal_mem (remset, INTERNAL_MEM_REMSET);
+               freed_thread_remsets = next;
+       }
 }
 
 /*
@@ -4654,9 +5945,15 @@ clear_remsets (void)
                remset->next = NULL;
                if (remset != global_remset) {
                        DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
-                       free_internal_mem (remset);
+                       free_internal_mem (remset, INTERNAL_MEM_REMSET);
                }
        }
+       /* the generic store ones */
+       while (generic_store_remsets) {
+               GenericStoreRememberedSet *gs_next = generic_store_remsets->next;
+               free_internal_mem (generic_store_remsets, INTERNAL_MEM_STORE_REMSET);
+               generic_store_remsets = gs_next;
+       }
        /* the per-thread ones */
        for (i = 0; i < THREAD_HASH_SIZE; ++i) {
                for (info = thread_table [i]; info; info = info->next) {
@@ -4666,11 +5963,20 @@ clear_remsets (void)
                                remset->next = NULL;
                                if (remset != info->remset) {
                                        DEBUG (1, fprintf (gc_debug_file, "Freed remset at %p\n", remset->data));
-                                       free_internal_mem (remset);
+                                       free_internal_mem (remset, INTERNAL_MEM_REMSET);
                                }
                        }
+                       clear_thread_store_remset_buffer (info);
                }
        }
+
+       /* the freed thread ones */
+       while (freed_thread_remsets) {
+               next = freed_thread_remsets->next;
+               DEBUG (4, fprintf (gc_debug_file, "Freed remset at %p\n", freed_thread_remsets->data));
+               free_internal_mem (freed_thread_remsets, INTERNAL_MEM_REMSET);
+               freed_thread_remsets = next;
+       }
 }
 
 /*
@@ -4693,44 +5999,45 @@ clear_tlabs (void)
        }
 }
 
-/*
- * Find the tlab_next value of the TLAB which contains ADDR.
- */
-static char*
-find_tlab_next_from_address (char *addr)
-{
-       SgenThreadInfo *info;
-       int i;
-
-       for (i = 0; i < THREAD_HASH_SIZE; ++i) {
-               for (info = thread_table [i]; info; info = info->next) {
-                       if (addr >= *info->tlab_start_addr && addr < *info->tlab_next_addr)
-                               return *info->tlab_next_addr;
-               }
-       }
-
-       return NULL;
-}
-
 /* LOCKING: assumes the GC lock is held */
 static SgenThreadInfo*
 gc_register_current_thread (void *addr)
 {
        int hash;
        SgenThreadInfo* info = malloc (sizeof (SgenThreadInfo));
+#ifndef HAVE_KW_THREAD
+       SgenThreadInfo *__thread_info__ = info;
+#endif
+
        if (!info)
                return NULL;
+
+#ifndef HAVE_KW_THREAD
+       info->tlab_start = info->tlab_next = info->tlab_temp_end = info->tlab_real_end = NULL;
+
+       g_assert (!pthread_getspecific (thread_info_key));
+       pthread_setspecific (thread_info_key, info);
+#endif
+
        info->id = ARCH_GET_THREAD ();
        info->stop_count = -1;
        info->skip = 0;
        info->signal = 0;
        info->stack_start = NULL;
-       info->tlab_start_addr = &tlab_start;
-       info->tlab_next_addr = &tlab_next;
-       info->tlab_temp_end_addr = &tlab_temp_end;
-       info->tlab_real_end_addr = &tlab_real_end;
-
+       info->tlab_start_addr = &TLAB_START;
+       info->tlab_next_addr = &TLAB_NEXT;
+       info->tlab_temp_end_addr = &TLAB_TEMP_END;
+       info->tlab_real_end_addr = &TLAB_REAL_END;
+       info->store_remset_buffer_addr = &STORE_REMSET_BUFFER;
+       info->store_remset_buffer_index_addr = &STORE_REMSET_BUFFER_INDEX;
+       info->stopped_ip = NULL;
+       info->stopped_domain = NULL;
+       info->stopped_regs = NULL;
+
+#ifdef HAVE_KW_THREAD
        tlab_next_addr = &tlab_next;
+       store_remset_buffer_index_addr = &store_remset_buffer_index;
+#endif
 
        /* try to get it with attributes first */
 #if defined(HAVE_PTHREAD_GETATTR_NP) && defined(HAVE_PTHREAD_ATTR_GETSTACK)
@@ -4740,6 +6047,7 @@ gc_register_current_thread (void *addr)
                pthread_attr_t attr;
                pthread_getattr_np (pthread_self (), &attr);
                pthread_attr_getstack (&attr, &sstart, &size);
+               info->stack_start_limit = sstart;
                info->stack_end = (char*)sstart + size;
                pthread_attr_destroy (&attr);
        }
@@ -4755,13 +6063,24 @@ gc_register_current_thread (void *addr)
        }
 #endif
 
-       /* hash into the table */
+#ifdef HAVE_KW_THREAD
+       stack_end = info->stack_end;
+#endif
+
+       /* hash into the table */
        hash = HASH_PTHREAD_T (info->id) % THREAD_HASH_SIZE;
        info->next = thread_table [hash];
        thread_table [hash] = info;
 
-       remembered_set = info->remset = alloc_remset (DEFAULT_REMSET_SIZE, info);
-       pthread_setspecific (remembered_set_key, remembered_set);
+       info->remset = alloc_remset (DEFAULT_REMSET_SIZE, info);
+       pthread_setspecific (remembered_set_key, info->remset);
+#ifdef HAVE_KW_THREAD
+       remembered_set = info->remset;
+#endif
+
+       STORE_REMSET_BUFFER = get_internal_mem (sizeof (gpointer) * STORE_REMSET_BUFFER_SIZE, INTERNAL_MEM_STORE_REMSET);
+       STORE_REMSET_BUFFER_INDEX = 0;
+
        DEBUG (3, fprintf (gc_debug_file, "registered thread %p (%p) (hash: %d)\n", info, (gpointer)info->id, hash));
 
        if (gc_callbacks.thread_attach_func)
@@ -4770,6 +6089,15 @@ gc_register_current_thread (void *addr)
        return info;
 }
 
+static void
+add_generic_store_remset_from_buffer (gpointer *buffer)
+{
+       GenericStoreRememberedSet *remset = get_internal_mem (sizeof (GenericStoreRememberedSet), INTERNAL_MEM_STORE_REMSET);
+       memcpy (remset->data, buffer + 1, sizeof (gpointer) * (STORE_REMSET_BUFFER_SIZE - 1));
+       remset->next = generic_store_remsets;
+       generic_store_remsets = remset;
+}
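
/*
 * Not from this change: the copy above takes STORE_REMSET_BUFFER_SIZE - 1
 * entries starting at buffer + 1 because slot 0 of a thread's store remset
 * buffer is a sentinel that mono_gc_wbarrier_generic_nostore () never
 * writes; live entries sit at slots 1 .. STORE_REMSET_BUFFER_INDEX, which
 * is also why the per-thread scanning code indexes the buffer with j + 1.
 */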
+
 static void
 unregister_current_thread (void)
 {
@@ -4792,19 +6120,26 @@ unregister_current_thread (void)
        } else {
                prev->next = p->next;
        }
-       rset = p->remset;
-       /* FIXME: transfer remsets if any */
-       while (rset) {
-               RememberedSet *next = rset->next;
-               free_internal_mem (rset);
-               rset = next;
+       if (p->remset) {
+               if (freed_thread_remsets) {
+                       for (rset = p->remset; rset->next; rset = rset->next)
+                               ;
+                       rset->next = freed_thread_remsets;
+                       freed_thread_remsets = p->remset;
+               } else {
+                       freed_thread_remsets = p->remset;
+               }
        }
+       if (*p->store_remset_buffer_index_addr)
+               add_generic_store_remset_from_buffer (*p->store_remset_buffer_addr);
+       free_internal_mem (*p->store_remset_buffer_addr, INTERNAL_MEM_STORE_REMSET);
        free (p);
 }
 
 static void
 unregister_thread (void *k)
 {
+       g_assert (!mono_domain_get ());
        LOCK_GC;
        unregister_current_thread ();
        UNLOCK_GC;
@@ -4814,7 +6149,9 @@ gboolean
 mono_gc_register_thread (void *baseptr)
 {
        SgenThreadInfo *info;
+
        LOCK_GC;
+       init_stats ();
        info = thread_info_lookup (ARCH_GET_THREAD ());
        if (info == NULL)
                info = gc_register_current_thread (baseptr);
@@ -4832,7 +6169,7 @@ typedef struct {
        void *(*start_routine) (void *);
        void *arg;
        int flags;
-       sem_t registered;
+       MonoSemType registered;
 } SgenThreadStartInfo;
 
 static void*
@@ -4843,12 +6180,15 @@ gc_start_thread (void *arg)
        void *t_arg = start_info->arg;
        void *(*start_func) (void*) = start_info->start_routine;
        void *result;
+       int post_result;
 
        LOCK_GC;
        info = gc_register_current_thread (&result);
        UNLOCK_GC;
-       sem_post (&(start_info->registered));
+       post_result = MONO_SEM_POST (&(start_info->registered));
+       g_assert (!post_result);
        result = start_func (t_arg);
+       g_assert (!mono_domain_get ());
        /*
         * this is done by the pthread key dtor
        LOCK_GC;
@@ -4868,17 +6208,18 @@ mono_gc_pthread_create (pthread_t *new_thread, const pthread_attr_t *attr, void
        start_info = malloc (sizeof (SgenThreadStartInfo));
        if (!start_info)
                return ENOMEM;
-       sem_init (&(start_info->registered), 0, 0);
+       result = MONO_SEM_INIT (&(start_info->registered), 0);
+       g_assert (!result);
        start_info->arg = arg;
        start_info->start_routine = start_routine;
 
        result = pthread_create (new_thread, attr, gc_start_thread, start_info);
        if (result == 0) {
-               while (sem_wait (&(start_info->registered)) != 0) {
+               while (MONO_SEM_WAIT (&(start_info->registered)) != 0) {
                        /*if (EINTR != errno) ABORT("sem_wait failed"); */
                }
        }
-       sem_destroy (&(start_info->registered));
+       MONO_SEM_DESTROY (&(start_info->registered));
        free (start_info);
        return result;
 }
@@ -4905,7 +6246,7 @@ mono_gc_pthread_detach (pthread_t thread)
 
 static RememberedSet*
 alloc_remset (int size, gpointer id) {
-       RememberedSet* res = get_internal_mem (sizeof (RememberedSet) + (size * sizeof (gpointer)));
+       RememberedSet* res = get_internal_mem (sizeof (RememberedSet) + (size * sizeof (gpointer)), INTERNAL_MEM_REMSET);
        res->store_next = res->data;
        res->end_set = res->data + size;
        res->next = NULL;
@@ -4925,100 +6266,242 @@ void
 mono_gc_wbarrier_set_field (MonoObject *obj, gpointer field_ptr, MonoObject* value)
 {
        RememberedSet *rs;
+       TLAB_ACCESS_INIT;
+       HEAVY_STAT (++stat_wbarrier_set_field);
        if (ptr_in_nursery (field_ptr)) {
                *(void**)field_ptr = value;
                return;
        }
        DEBUG (8, fprintf (gc_debug_file, "Adding remset at %p\n", field_ptr));
-       rs = remembered_set;
+       LOCK_GC;
+       rs = REMEMBERED_SET;
        if (rs->store_next < rs->end_set) {
                *(rs->store_next++) = (mword)field_ptr;
                *(void**)field_ptr = value;
+               UNLOCK_GC;
                return;
        }
        rs = alloc_remset (rs->end_set - rs->data, (void*)1);
-       rs->next = remembered_set;
-       remembered_set = rs;
+       rs->next = REMEMBERED_SET;
+       REMEMBERED_SET = rs;
+#ifdef HAVE_KW_THREAD
        thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
+#endif
        *(rs->store_next++) = (mword)field_ptr;
        *(void**)field_ptr = value;
+       UNLOCK_GC;
 }
 
 void
 mono_gc_wbarrier_set_arrayref (MonoArray *arr, gpointer slot_ptr, MonoObject* value)
 {
-       RememberedSet *rs = remembered_set;
+       RememberedSet *rs;
+       TLAB_ACCESS_INIT;
+       HEAVY_STAT (++stat_wbarrier_set_arrayref);
        if (ptr_in_nursery (slot_ptr)) {
                *(void**)slot_ptr = value;
                return;
        }
        DEBUG (8, fprintf (gc_debug_file, "Adding remset at %p\n", slot_ptr));
+       LOCK_GC;
+       rs = REMEMBERED_SET;
        if (rs->store_next < rs->end_set) {
                *(rs->store_next++) = (mword)slot_ptr;
                *(void**)slot_ptr = value;
+               UNLOCK_GC;
                return;
        }
        rs = alloc_remset (rs->end_set - rs->data, (void*)1);
-       rs->next = remembered_set;
-       remembered_set = rs;
+       rs->next = REMEMBERED_SET;
+       REMEMBERED_SET = rs;
+#ifdef HAVE_KW_THREAD
        thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
+#endif
        *(rs->store_next++) = (mword)slot_ptr;
        *(void**)slot_ptr = value;
+       UNLOCK_GC;
 }
 
 void
-mono_gc_wbarrier_arrayref_copy (MonoArray *arr, gpointer slot_ptr, int count)
+mono_gc_wbarrier_arrayref_copy (gpointer dest_ptr, gpointer src_ptr, int count)
 {
-       RememberedSet *rs = remembered_set;
-       if (ptr_in_nursery (slot_ptr))
+       RememberedSet *rs;
+       TLAB_ACCESS_INIT;
+       HEAVY_STAT (++stat_wbarrier_arrayref_copy);
+       LOCK_GC;
+       memmove (dest_ptr, src_ptr, count * sizeof (gpointer));
+       if (ptr_in_nursery (dest_ptr)) {
+               UNLOCK_GC;
                return;
-       DEBUG (8, fprintf (gc_debug_file, "Adding remset at %p, %d\n", slot_ptr, count));
+       }
+       rs = REMEMBERED_SET;
+       DEBUG (8, fprintf (gc_debug_file, "Adding remset at %p, %d\n", dest_ptr, count));
        if (rs->store_next + 1 < rs->end_set) {
-               *(rs->store_next++) = (mword)slot_ptr | REMSET_RANGE;
+               *(rs->store_next++) = (mword)dest_ptr | REMSET_RANGE;
                *(rs->store_next++) = count;
+               UNLOCK_GC;
                return;
        }
        rs = alloc_remset (rs->end_set - rs->data, (void*)1);
-       rs->next = remembered_set;
-       remembered_set = rs;
+       rs->next = REMEMBERED_SET;
+       REMEMBERED_SET = rs;
+#ifdef HAVE_KW_THREAD
        thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
-       *(rs->store_next++) = (mword)slot_ptr | REMSET_RANGE;
+#endif
+       *(rs->store_next++) = (mword)dest_ptr | REMSET_RANGE;
        *(rs->store_next++) = count;
+       UNLOCK_GC;
+}
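
/*
 * A minimal usage sketch, not from this change: with the new signature the
 * barrier performs the element copy itself, so a hypothetical caller
 * copying "count" object references between arrays just passes the raw
 * slot addresses (mono_array_addr is the usual element accessor), e.g.:
 *
 *     mono_gc_wbarrier_arrayref_copy (mono_array_addr (dest, MonoObject*, dest_idx),
 *                                     mono_array_addr (src, MonoObject*, src_idx),
 *                                     count);
 */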
+
+static char*
+find_object_for_ptr_in_area (char *ptr, char *start, char *end)
+{
+       while (start < end) {
+               char *old_start;
+
+               if (!*(void**)start) {
+                       start += sizeof (void*); /* should be ALLOC_ALIGN, really */
+                       continue;
+               }
+
+               old_start = start;
+
+               #define SCAN_OBJECT_NOSCAN
+               #include "sgen-scan-object.h"
+
+               if (ptr >= old_start && ptr < start)
+                       return old_start;
+       }
+
+       return NULL;
+}
+
+static char *found_obj;
+
+static void
+find_object_for_ptr_in_pinned_chunk_callback (PinnedChunk *chunk, char *obj, size_t size, char *ptr)
+{
+       if (ptr >= obj && ptr < obj + size) {
+               g_assert (!found_obj);
+               found_obj = obj;
+       }
+}
+
+/* for use in the debugger */
+char* find_object_for_ptr (char *ptr);
+char*
+find_object_for_ptr (char *ptr)
+{
+       GCMemSection *section;
+       LOSObject *bigobj;
+
+       for (section = section_list; section; section = section->block.next) {
+               if (ptr >= section->data && ptr < section->end_data)
+                       return find_object_for_ptr_in_area (ptr, section->data, section->end_data);
+       }
+
+       for (bigobj = los_object_list; bigobj; bigobj = bigobj->next) {
+               if (ptr >= bigobj->data && ptr < bigobj->data + bigobj->size)
+                       return bigobj->data;
+       }
+
+       found_obj = NULL;
+       scan_pinned_objects ((ScanPinnedObjectCallbackFunc)find_object_for_ptr_in_pinned_chunk_callback, ptr);
+       return found_obj;
+}
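
/*
 * Not from this change: find_object_for_ptr () is meant to be invoked by
 * hand from a debugger, e.g. in gdb:
 *
 *     (gdb) p find_object_for_ptr ((char *) <address>)
 *
 * It returns the start of the object containing the pointer, or NULL if no
 * heap section, LOS object or pinned chunk contains it.
 */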
+
+static void
+evacuate_remset_buffer (void)
+{
+       gpointer *buffer;
+       TLAB_ACCESS_INIT;
+
+       buffer = STORE_REMSET_BUFFER;
+
+       add_generic_store_remset_from_buffer (buffer);
+       memset (buffer, 0, sizeof (gpointer) * STORE_REMSET_BUFFER_SIZE);
+
+       STORE_REMSET_BUFFER_INDEX = 0;
 }
 
 void
-mono_gc_wbarrier_generic_store (gpointer ptr, MonoObject* value)
+mono_gc_wbarrier_generic_nostore (gpointer ptr)
 {
-       RememberedSet *rs;
-       if (ptr_in_nursery (ptr)) {
+       gpointer *buffer;
+       int index;
+       TLAB_ACCESS_INIT;
+
+       HEAVY_STAT (++stat_wbarrier_generic_store);
+
+#ifdef XDOMAIN_CHECKS_IN_WBARRIER
+       /* FIXME: ptr_in_heap must be called with the GC lock held */
+       if (xdomain_checks && *(MonoObject**)ptr && ptr_in_heap (ptr)) {
+               char *start = find_object_for_ptr (ptr);
+               MonoObject *value = *(MonoObject**)ptr;
+               LOCK_GC;
+               g_assert (start);
+               if (start) {
+                       MonoObject *obj = (MonoObject*)start;
+                       if (obj->vtable->domain != value->vtable->domain)
+                               g_assert (is_xdomain_ref_allowed (ptr, start, obj->vtable->domain));
+               }
+               UNLOCK_GC;
+       }
+#endif
+
+       LOCK_GC;
+       if (ptr_in_nursery (ptr) || ptr_on_stack (ptr) || !ptr_in_nursery (*(gpointer*)ptr)) {
                DEBUG (8, fprintf (gc_debug_file, "Skipping remset at %p\n", ptr));
-               *(void**)ptr = value;
+               UNLOCK_GC;
                return;
        }
-       rs = remembered_set;
-       DEBUG (8, fprintf (gc_debug_file, "Adding remset at %p (%s)\n", ptr, value ? safe_name (value) : "null"));
-       /* FIXME: ensure it is on the heap */
-       if (rs->store_next < rs->end_set) {
-               *(rs->store_next++) = (mword)ptr;
-               *(void**)ptr = value;
+
+       buffer = STORE_REMSET_BUFFER;
+       index = STORE_REMSET_BUFFER_INDEX;
+       /* This simple optimization eliminates a sizable portion of
+          entries.  Comparing it to the last but one entry as well
+          doesn't eliminate significantly more entries. */
+       if (buffer [index] == ptr) {
+               UNLOCK_GC;
                return;
        }
-       rs = alloc_remset (rs->end_set - rs->data, (void*)1);
-       rs->next = remembered_set;
-       remembered_set = rs;
-       thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
-       *(rs->store_next++) = (mword)ptr;
+
+       DEBUG (8, fprintf (gc_debug_file, "Adding remset at %p\n", ptr));
+       HEAVY_STAT (++stat_wbarrier_generic_store_remset);
+
+       ++index;
+       if (index >= STORE_REMSET_BUFFER_SIZE) {
+               evacuate_remset_buffer ();
+               index = STORE_REMSET_BUFFER_INDEX;
+               g_assert (index == 0);
+               ++index;
+       }
+       buffer [index] = ptr;
+       STORE_REMSET_BUFFER_INDEX = index;
+
+       UNLOCK_GC;
+}
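
/*
 * Worked example, not from this change, assuming STORE_REMSET_BUFFER_SIZE
 * were 4 purely for illustration. Slot 0 is a never-written sentinel, so
 * the dedup check against buffer [index] is safe even when the buffer is
 * empty:
 *
 *     start:           index = 0, buffer = { 0,  -,  -,  - }
 *     record p1:       index = 1, buffer = { 0, p1,  -,  - }
 *     record p1 again: skipped, buffer [1] == p1
 *     record p2, p3:   index = 3, buffer = { 0, p1, p2, p3 }
 *     record p4:       index would reach STORE_REMSET_BUFFER_SIZE, so the
 *                      buffer is evacuated into a GenericStoreRememberedSet
 *                      and p4 lands at index 1 of the cleared buffer.
 */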
+
+void
+mono_gc_wbarrier_generic_store (gpointer ptr, MonoObject* value)
+{
+       DEBUG (8, fprintf (gc_debug_file, "Wbarrier store at %p to %p (%s)\n", ptr, value, value ? safe_name (value) : "null"));
        *(void**)ptr = value;
+       if (ptr_in_nursery (value))
+               mono_gc_wbarrier_generic_nostore (ptr);
 }
 
 void
 mono_gc_wbarrier_set_root (gpointer ptr, MonoObject *value)
 {
-       RememberedSet *rs = remembered_set;
+       RememberedSet *rs;
+       TLAB_ACCESS_INIT;
+       HEAVY_STAT (++stat_wbarrier_set_root);
        if (ptr_in_nursery (ptr))
                return;
        DEBUG (8, fprintf (gc_debug_file, "Adding root remset at %p (%s)\n", ptr, value ? safe_name (value) : "null"));
 
+       rs = REMEMBERED_SET;
        if (rs->store_next + 2 < rs->end_set) {
                *(rs->store_next++) = (mword)ptr | REMSET_OTHER;
                *(rs->store_next++) = (mword)REMSET_ROOT_LOCATION;
@@ -5026,9 +6509,11 @@ mono_gc_wbarrier_set_root (gpointer ptr, MonoObject *value)
                return;
        }
        rs = alloc_remset (rs->end_set - rs->data, (void*)1);
-       rs->next = remembered_set;
-       remembered_set = rs;
+       rs->next = REMEMBERED_SET;
+       REMEMBERED_SET = rs;
+#ifdef HAVE_KW_THREAD
        thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
+#endif
        *(rs->store_next++) = (mword)ptr | REMSET_OTHER;
        *(rs->store_next++) = (mword)REMSET_ROOT_LOCATION;
 
@@ -5038,45 +6523,78 @@ mono_gc_wbarrier_set_root (gpointer ptr, MonoObject *value)
 void
 mono_gc_wbarrier_value_copy (gpointer dest, gpointer src, int count, MonoClass *klass)
 {
-       RememberedSet *rs = remembered_set;
-       if (ptr_in_nursery (dest))
+       RememberedSet *rs;
+       TLAB_ACCESS_INIT;
+       HEAVY_STAT (++stat_wbarrier_value_copy);
+       g_assert (klass->valuetype);
+       LOCK_GC;
+       memmove (dest, src, count * mono_class_value_size (klass, NULL));
+       rs = REMEMBERED_SET;
+       if (ptr_in_nursery (dest) || ptr_on_stack (dest)) {
+               UNLOCK_GC;
                return;
-       DEBUG (8, fprintf (gc_debug_file, "Adding value remset at %p, count %d for class %s\n", dest, count, klass->name));
+       }
+       g_assert (klass->gc_descr_inited);
+       DEBUG (8, fprintf (gc_debug_file, "Adding value remset at %p, count %d, descr %p for class %s (%p)\n", dest, count, klass->gc_descr, klass->name, klass));
 
-       if (rs->store_next + 2 < rs->end_set) {
+       if (rs->store_next + 3 < rs->end_set) {
                *(rs->store_next++) = (mword)dest | REMSET_OTHER;
                *(rs->store_next++) = (mword)REMSET_VTYPE;
                *(rs->store_next++) = (mword)klass->gc_descr;
+               *(rs->store_next++) = (mword)count;
+               UNLOCK_GC;
                return;
        }
        rs = alloc_remset (rs->end_set - rs->data, (void*)1);
-       rs->next = remembered_set;
-       remembered_set = rs;
+       rs->next = REMEMBERED_SET;
+       REMEMBERED_SET = rs;
+#ifdef HAVE_KW_THREAD
        thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
+#endif
        *(rs->store_next++) = (mword)dest | REMSET_OTHER;
        *(rs->store_next++) = (mword)REMSET_VTYPE;
        *(rs->store_next++) = (mword)klass->gc_descr;
+       *(rs->store_next++) = (mword)count;
+       UNLOCK_GC;
 }
 
 /**
- * mono_gc_wbarrier_object:
+ * mono_gc_wbarrier_object_copy:
  *
  * Write barrier to call when obj is the result of a clone or copy of an object.
  */
 void
-mono_gc_wbarrier_object (MonoObject* obj)
+mono_gc_wbarrier_object_copy (MonoObject* obj, MonoObject *src)
 {
-       RememberedSet *rs = remembered_set;
+       RememberedSet *rs;
+       int size;
+
+       TLAB_ACCESS_INIT;
+       HEAVY_STAT (++stat_wbarrier_object_copy);
+       rs = REMEMBERED_SET;
        DEBUG (1, fprintf (gc_debug_file, "Adding object remset for %p\n", obj));
+       size = mono_object_class (obj)->instance_size;
+       LOCK_GC;
+       /* do not copy the sync state */
+       memcpy ((char*)obj + sizeof (MonoObject), (char*)src + sizeof (MonoObject),
+                       size - sizeof (MonoObject));
+       if (ptr_in_nursery (obj) || ptr_on_stack (obj)) {
+               UNLOCK_GC;
+               return;
+       }
        if (rs->store_next < rs->end_set) {
                *(rs->store_next++) = (mword)obj | REMSET_OBJECT;
+               UNLOCK_GC;
                return;
        }
        rs = alloc_remset (rs->end_set - rs->data, (void*)1);
-       rs->next = remembered_set;
-       remembered_set = rs;
+       rs->next = REMEMBERED_SET;
+       REMEMBERED_SET = rs;
+#ifdef HAVE_KW_THREAD
        thread_info_lookup (ARCH_GET_THREAD ())->remset = rs;
+#endif
        *(rs->store_next++) = (mword)obj | REMSET_OBJECT;
+       UNLOCK_GC;
 }
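
/*
 * A minimal usage sketch, not from this change: the barrier now performs
 * the body copy itself, skipping the MonoObject header so the vtable and
 * sync state of the destination are left untouched. A hypothetical cloning
 * caller would only allocate the new object and invoke the barrier:
 *
 *     MonoObject *clone = mono_object_new (mono_object_domain (src),
 *                                          mono_object_class (src));
 *     mono_gc_wbarrier_object_copy (clone, src);
 */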
 
 /*
@@ -5110,7 +6628,7 @@ describe_ptr (char *ptr)
                for (section = section_list; section;) {
                        if (ptr >= section->data && ptr < section->data + section->size)
                                break;
-                       section = section->next;
+                       section = section->block.next;
                }
 
                if (section) {
@@ -5144,7 +6662,7 @@ describe_ptr (char *ptr)
        printf ("Class: %s\n", vtable->klass->name);
 
        desc = ((GCVTable*)vtable)->desc;
-       printf ("Descriptor: %lx\n", desc);
+       printf ("Descriptor: %lx\n", (long)desc);
 
        type = desc & 0x7;
        printf ("Descriptor type: %d (%s)\n", type, descriptor_types [type]);
@@ -5182,12 +6700,14 @@ find_in_remset_loc (mword *p, char *addr, gboolean *found)
                case REMSET_VTYPE:
                        ptr = (void**)(*p & ~REMSET_TYPE_MASK);
                        desc = p [2];
+                       count = p [3];
 
                        switch (desc & 0x7) {
                        case DESC_TYPE_RUN_LENGTH:
                                OBJ_RUN_LEN_SIZE (skip_size, desc, ptr);
                                /* The descriptor includes the size of MonoObject */
                                skip_size -= sizeof (MonoObject);
+                               skip_size *= count;
                                if ((void**)addr >= ptr && (void**)addr < ptr + (skip_size / sizeof (gpointer)))
                                        *found = TRUE;
                                break;
@@ -5196,7 +6716,7 @@ find_in_remset_loc (mword *p, char *addr, gboolean *found)
                                g_assert_not_reached ();
                        }
 
-                       return p + 3;
+                       return p + 4;
                case REMSET_ROOT_LOCATION:
                        return p + 2;
                default:
@@ -5219,6 +6739,7 @@ find_in_remsets (char *addr)
        int i;
        SgenThreadInfo *info;
        RememberedSet *remset;
+       GenericStoreRememberedSet *store_remset;
        mword *p;
        gboolean found = FALSE;
 
@@ -5231,9 +6752,19 @@ find_in_remsets (char *addr)
                                return TRUE;
                }
        }
+
+       /* the generic store ones */
+       for (store_remset = generic_store_remsets; store_remset; store_remset = store_remset->next) {
+               for (i = 0; i < STORE_REMSET_BUFFER_SIZE - 1; ++i) {
+                       if (store_remset->data [i] == addr)
+                               return TRUE;
+               }
+       }
+
        /* the per-thread ones */
        for (i = 0; i < THREAD_HASH_SIZE; ++i) {
                for (info = thread_table [i]; info; info = info->next) {
+                       int j;
                        for (remset = info->remset; remset; remset = remset->next) {
                                DEBUG (4, fprintf (gc_debug_file, "Scanning remset for thread %p, range: %p-%p, size: %zd\n", info, remset->data, remset->store_next, remset->store_next - remset->data));
                                for (p = remset->data; p < remset->store_next;) {
@@ -5242,6 +6773,20 @@ find_in_remsets (char *addr)
                                                return TRUE;
                                }
                        }
+                       for (j = 0; j < *info->store_remset_buffer_index_addr; ++j) {
+                               if ((*info->store_remset_buffer_addr) [j + 1] == addr)
+                                       return TRUE;
+                       }
+               }
+       }
+
+       /* the freed thread ones */
+       for (remset = freed_thread_remsets; remset; remset = remset->next) {
+               DEBUG (4, fprintf (gc_debug_file, "Scanning remset for freed thread, range: %p-%p, size: %zd\n", remset->data, remset->store_next, remset->store_next - remset->data));
+               for (p = remset->data; p < remset->store_next;) {
+                       p = find_in_remset_loc (p, addr, &found);
+                       if (found)
+                               return TRUE;
                }
        }
 
@@ -5266,12 +6811,7 @@ static void __attribute__((noinline))
 check_remsets_for_area (char *start, char *end)
 {
        GCVTable *vt;
-       size_t skip_size;
-       int type;
        int type_str = 0, type_rlen = 0, type_bitmap = 0, type_vector = 0, type_lbit = 0, type_complex = 0;
-       mword desc;
-       new_obj_references = 0;
-       obj_references_checked = 0;
        while (start < end) {
                if (!*(void**)start) {
                        start += sizeof (void*); /* should be ALLOC_ALIGN, really */
@@ -5283,74 +6823,9 @@ check_remsets_for_area (char *start, char *end)
                        MonoObject *obj = (MonoObject*)start;
                        g_print ("found at %p (0x%lx): %s.%s\n", start, (long)vt->desc, obj->vtable->klass->name_space, obj->vtable->klass->name);
                }
-               desc = vt->desc;
-               type = desc & 0x7;
-               if (type == DESC_TYPE_STRING) {
-                       STRING_SIZE (skip_size, start);
-                       start += skip_size;
-                       type_str++;
-                       continue;
-               } else if (type == DESC_TYPE_RUN_LENGTH) {
-                       OBJ_RUN_LEN_SIZE (skip_size, desc, start);
-                       g_assert (skip_size);
-                       OBJ_RUN_LEN_FOREACH_PTR (desc,start);
-                       start += skip_size;
-                       type_rlen++;
-                       continue;
-               } else if (type == DESC_TYPE_VECTOR) { // includes ARRAY, too
-                       skip_size = safe_object_get_size ((MonoObject*)start);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       OBJ_VECTOR_FOREACH_PTR (vt, start);
-                       if (((MonoArray*)start)->bounds) {
-                               /* account for the bounds */
-                               skip_size += sizeof (MonoArrayBounds) * vt->klass->rank;
-                       }
-                       start += skip_size;
-                       type_vector++;
-                       continue;
-               } else if (type == DESC_TYPE_SMALL_BITMAP) {
-                       OBJ_BITMAP_SIZE (skip_size, desc, start);
-                       g_assert (skip_size);
-                       OBJ_BITMAP_FOREACH_PTR (desc,start);
-                       start += skip_size;
-                       type_bitmap++;
-                       continue;
-               } else if (type == DESC_TYPE_LARGE_BITMAP) {
-                       skip_size = safe_object_get_size ((MonoObject*)start);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       OBJ_LARGE_BITMAP_FOREACH_PTR (vt,start);
-                       start += skip_size;
-                       type_lbit++;
-                       continue;
-               } else if (type == DESC_TYPE_COMPLEX) {
-                       /* this is a complex object */
-                       skip_size = safe_object_get_size ((MonoObject*)start);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       OBJ_COMPLEX_FOREACH_PTR (vt, start);
-                       start += skip_size;
-                       type_complex++;
-                       continue;
-               } else if (type == DESC_TYPE_COMPLEX_ARR) {
-                       /* this is an array of complex structs */
-                       skip_size = mono_array_element_size (((MonoVTable*)vt)->klass);
-                       skip_size *= mono_array_length ((MonoArray*)start);
-                       skip_size += sizeof (MonoArray);
-                       skip_size += (ALLOC_ALIGN - 1);
-                       skip_size &= ~(ALLOC_ALIGN - 1);
-                       OBJ_COMPLEX_ARR_FOREACH_PTR (vt, start);
-                       if (((MonoArray*)start)->bounds) {
-                               /* account for the bounds */
-                               skip_size += sizeof (MonoArrayBounds) * vt->klass->rank;
-                       }
-                       start += skip_size;
-                       type_complex++;
-                       continue;
-               } else {
-                       g_assert (0);
-               }
+
+#define SCAN_OBJECT_ACTION COUNT_OBJECT_TYPES
+#include "sgen-scan-object.h"
        }
 }
 
@@ -5370,8 +6845,8 @@ check_consistency (void)
        DEBUG (1, fprintf (gc_debug_file, "Begin heap consistency check...\n"));
 
        // Check that oldspace->newspace pointers are registered with the collector
-       for (section = section_list; section; section = section->next) {
-               if (section->role == MEMORY_ROLE_GEN0)
+       for (section = section_list; section; section = section->block.next) {
+               if (section->block.role == MEMORY_ROLE_GEN0)
                        continue;
                DEBUG (2, fprintf (gc_debug_file, "Scan of old section: %p-%p, size: %d\n", section->data, section->next_data, (int)(section->next_data - section->data)));
                check_remsets_for_area (section->data, section->next_data);
@@ -5397,60 +6872,12 @@ check_consistency (void)
 char*
 check_object (char *start)
 {
-       GCVTable *vt;
-       size_t skip_size;
-       mword desc;
-
        if (!start)
                return NULL;
 
-       vt = (GCVTable*)LOAD_VTABLE (start);
-       //type = vt->desc & 0x7;
+#include "sgen-scan-object.h"
 
-       desc = vt->desc;
-       switch (desc & 0x7) {
-       case DESC_TYPE_STRING:
-               STRING_SIZE (skip_size, start);
-               return start + skip_size;
-       case DESC_TYPE_RUN_LENGTH:
-               OBJ_RUN_LEN_FOREACH_PTR (desc,start);
-               OBJ_RUN_LEN_SIZE (skip_size, desc, start);
-               g_assert (skip_size);
-               return start + skip_size;
-       case DESC_TYPE_ARRAY:
-       case DESC_TYPE_VECTOR:
-               OBJ_VECTOR_FOREACH_PTR (vt, start);
-               skip_size = safe_object_get_size ((MonoObject*)start);
-               skip_size += (ALLOC_ALIGN - 1);
-               skip_size &= ~(ALLOC_ALIGN - 1);
-               return start + skip_size;
-       case DESC_TYPE_SMALL_BITMAP:
-               OBJ_BITMAP_FOREACH_PTR (desc,start);
-               OBJ_BITMAP_SIZE (skip_size, desc, start);
-               return start + skip_size;
-       case DESC_TYPE_LARGE_BITMAP:
-               OBJ_LARGE_BITMAP_FOREACH_PTR (vt,start);
-               skip_size = safe_object_get_size ((MonoObject*)start);
-               skip_size += (ALLOC_ALIGN - 1);
-               skip_size &= ~(ALLOC_ALIGN - 1);
-               return start + skip_size;
-       case DESC_TYPE_COMPLEX:
-               OBJ_COMPLEX_FOREACH_PTR (vt, start);
-               /* this is a complex object */
-               skip_size = safe_object_get_size ((MonoObject*)start);
-               skip_size += (ALLOC_ALIGN - 1);
-               skip_size &= ~(ALLOC_ALIGN - 1);
-               return start + skip_size;
-       case DESC_TYPE_COMPLEX_ARR:
-               OBJ_COMPLEX_ARR_FOREACH_PTR (vt, start);
-               /* this is an array of complex structs */
-               skip_size = safe_object_get_size ((MonoObject*)start);
-               skip_size += (ALLOC_ALIGN - 1);
-               skip_size &= ~(ALLOC_ALIGN - 1);
-               return start + skip_size;
-       }
-       g_assert_not_reached ();
-       return NULL;
+       return start;
 }
 
 /*
@@ -5463,12 +6890,11 @@ void
 mono_gc_collect (int generation)
 {
        LOCK_GC;
-       update_current_thread_stack (&generation);
        stop_world ();
        if (generation == 0) {
                collect_nursery (0);
        } else {
-               major_collection ();
+               major_collection ("user request");
        }
        restart_world ();
        UNLOCK_GC;
@@ -5495,7 +6921,7 @@ mono_gc_get_used_size (void)
        GCMemSection *section;
        LOCK_GC;
        tot = los_memory_usage;
-       for (section = section_list; section; section = section->next) {
+       for (section = section_list; section; section = section->block.next) {
                /* this is approximate... */
                tot += section->next_data - section->data;
        }
@@ -5526,6 +6952,12 @@ mono_gc_enable (void)
        UNLOCK_GC;
 }
 
+int
+mono_gc_get_los_limit (void)
+{
+       return MAX_SMALL_OBJ_SIZE;
+}
+
 gboolean
 mono_object_is_alive (MonoObject* o)
 {
@@ -5548,13 +6980,17 @@ mono_gc_enable_events (void)
 void
 mono_gc_weak_link_add (void **link_addr, MonoObject *obj, gboolean track)
 {
+       LOCK_GC;
        mono_gc_register_disappearing_link (obj, link_addr, track);
+       UNLOCK_GC;
 }
 
 void
 mono_gc_weak_link_remove (void **link_addr)
 {
+       LOCK_GC;
        mono_gc_register_disappearing_link (NULL, link_addr, FALSE);
+       UNLOCK_GC;
 }
 
 MonoObject*
@@ -5654,19 +7090,30 @@ mono_gc_base_init (void)
                                collect_before_allocs = TRUE;
                        } else if (!strcmp (opt, "check-at-minor-collections")) {
                                consistency_check_at_minor_collection = TRUE;
+                       } else if (!strcmp (opt, "xdomain-checks")) {
+                               xdomain_checks = TRUE;
                        } else if (!strcmp (opt, "clear-at-gc")) {
                                nursery_clear_policy = CLEAR_AT_GC;
+                       } else if (!strcmp (opt, "conservative-stack-mark")) {
+                               conservative_stack_mark = TRUE;
+                       } else if (g_str_has_prefix (opt, "heap-dump=")) {
+                               char *filename = strchr (opt, '=') + 1;
+                               nursery_clear_policy = CLEAR_AT_GC;
+                               heap_dump_file = fopen (filename, "w");
+                               if (heap_dump_file)
+                                       fprintf (heap_dump_file, "<sgen-dump>\n");
                        } else {
                                fprintf (stderr, "Invalid format for the MONO_GC_DEBUG env variable: '%s'\n", env);
                                fprintf (stderr, "The format is: MONO_GC_DEBUG=[l[:filename]|<option>]+ where l is a debug level 0-9.\n");
-                               fprintf (stderr, "Valid options are: collect-before-allocs, check-at-minor-collections, clear-at-gc.\n");
+                               fprintf (stderr, "Valid options are: collect-before-allocs, check-at-minor-collections, xdomain-checks, clear-at-gc, conservative-stack-mark, heap-dump=<filename>.\n");
                                exit (1);
                        }
                }
                g_strfreev (opts);
        }
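
All of the new debug switches above ride on the comma-separated MONO_GC_DEBUG value, for example something like MONO_GC_DEBUG=2:gc.log,heap-dump=dump.xml,xdomain-checks. Below is a rough, libc-only sketch of that kind of option splitting; the option names are copied from the code above, but the parsing itself is illustrative and not the runtime's actual glib-based loop.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

int
main (void)
{
        /* e.g. MONO_GC_DEBUG=2:gc.log,heap-dump=dump.xml,xdomain-checks */
        const char *env = getenv ("MONO_GC_DEBUG");
        char copy [256], *opt;

        if (!env)
                return 0;
        strncpy (copy, env, sizeof (copy) - 1);
        copy [sizeof (copy) - 1] = '\0';
        for (opt = strtok (copy, ","); opt; opt = strtok (NULL, ",")) {
                if (!strcmp (opt, "xdomain-checks"))
                        printf ("xdomain checks enabled\n");
                else if (!strncmp (opt, "heap-dump=", strlen ("heap-dump=")))
                        printf ("heap dump file: %s\n", strchr (opt, '=') + 1);
                else
                        printf ("other item: %s\n", opt);
        }
        return 0;
}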
 
-       sem_init (&suspend_ack_semaphore, 0, 0);
+       suspend_ack_semaphore_ptr = &suspend_ack_semaphore;
+       MONO_SEM_INIT (&suspend_ack_semaphore, 0);
 
        sigfillset (&sinfo.sa_mask);
        sinfo.sa_flags = SA_RESTART | SA_SIGINFO;
@@ -5687,61 +7134,177 @@ mono_gc_base_init (void)
        global_remset->next = NULL;
 
        pthread_key_create (&remembered_set_key, unregister_thread);
+
+#ifndef HAVE_KW_THREAD
+       pthread_key_create (&thread_info_key, NULL);
+#endif
+
        gc_initialized = TRUE;
        UNLOCK_GC;
        mono_gc_register_thread (&sinfo);
 }
 
+int
+mono_gc_get_suspend_signal (void)
+{
+       return suspend_signal_num;
+}
+
 enum {
        ATYPE_NORMAL,
+       ATYPE_VECTOR,
+       ATYPE_SMALL,
        ATYPE_NUM
 };
 
+#ifdef HAVE_KW_THREAD
+#define EMIT_TLS_ACCESS(mb,dummy,offset)       do {    \
+       mono_mb_emit_byte ((mb), MONO_CUSTOM_PREFIX);   \
+       mono_mb_emit_byte ((mb), CEE_MONO_TLS);         \
+       mono_mb_emit_i4 ((mb), (offset));               \
+       } while (0)
+#else
+#define EMIT_TLS_ACCESS(mb,member,dummy)       do {    \
+       mono_mb_emit_byte ((mb), MONO_CUSTOM_PREFIX);   \
+       mono_mb_emit_byte ((mb), CEE_MONO_TLS);         \
+       mono_mb_emit_i4 ((mb), thread_info_key);        \
+       mono_mb_emit_icon ((mb), G_STRUCT_OFFSET (SgenThreadInfo, member));     \
+       mono_mb_emit_byte ((mb), CEE_ADD);              \
+       mono_mb_emit_byte ((mb), CEE_LDIND_I);          \
+       } while (0)
+#endif
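
EMIT_TLS_ACCESS hides the difference between reading a per-thread value straight out of a __thread variable (HAVE_KW_THREAD) and fetching the SgenThreadInfo pointer stored under thread_info_key and indexing into it by member offset. A minimal C sketch of the two shapes the emitted IL corresponds to; the struct layout and member below are stand-ins, not the real SgenThreadInfo.

#include <pthread.h>

typedef struct {
        char *tlab_next_addr;   /* stand-in member, not the real layout */
} ThreadInfoSketch;

#ifdef HAVE_KW_THREAD
/* fast case: the value lives in a __thread variable, so the generated code
 * is a single TLS load at a known offset */
static __thread char *tlab_next_addr;
#define READ_TLS_MEMBER()  (tlab_next_addr)
#else
/* fallback: load the per-thread info pointer from a pthread key, then read
 * the member at its offset inside the struct (the key is created at GC init) */
static pthread_key_t thread_info_key;
#define READ_TLS_MEMBER() \
        (((ThreadInfoSketch *) pthread_getspecific (thread_info_key))->tlab_next_addr)
#endif

static char *
tls_access_sketch (void)
{
        return READ_TLS_MEMBER ();
}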
+
+#ifdef MANAGED_ALLOCATION
 /* FIXME: Do this in the JIT, where specialized allocation sequences can be created
  * for each class. This is currently not easy to do, as it is hard to generate basic 
  * blocks + branches, but it is easy with the linear IL codebase.
+ *
+ * For this to work we'd first need to solve the TLAB race.  For now we
+ * require the allocator to live in a few known methods so that we can
+ * be sure they are executed atomically via the restart mechanism.
  */
 static MonoMethod*
 create_allocator (int atype)
 {
-       int tlab_next_addr_offset = -1;
-       int tlab_temp_end_offset = -1;
-       int p_var, size_var, tlab_next_addr_var, new_next_var;
-       guint32 slowpath_branch;
+       int p_var, size_var;
+       guint32 slowpath_branch, max_size_branch;
        MonoMethodBuilder *mb;
        MonoMethod *res;
        MonoMethodSignature *csig;
        static gboolean registered = FALSE;
+       int tlab_next_addr_var, new_next_var;
+       int num_params, i;
+       const char *name = NULL;
+       AllocatorWrapperInfo *info;
+
+#ifdef HAVE_KW_THREAD
+       int tlab_next_addr_offset = -1;
+       int tlab_temp_end_offset = -1;
 
        MONO_THREAD_VAR_OFFSET (tlab_next_addr, tlab_next_addr_offset);
        MONO_THREAD_VAR_OFFSET (tlab_temp_end, tlab_temp_end_offset);
 
        g_assert (tlab_next_addr_offset != -1);
        g_assert (tlab_temp_end_offset != -1);
-
-       g_assert (atype == ATYPE_NORMAL);
+#endif
 
        if (!registered) {
                mono_register_jit_icall (mono_gc_alloc_obj, "mono_gc_alloc_obj", mono_create_icall_signature ("object ptr int"), FALSE);
+               mono_register_jit_icall (mono_gc_alloc_vector, "mono_gc_alloc_vector", mono_create_icall_signature ("object ptr int int"), FALSE);
                registered = TRUE;
        }
 
-       csig = mono_metadata_signature_alloc (mono_defaults.corlib, 1);
+       if (atype == ATYPE_SMALL) {
+               num_params = 1;
+               name = "AllocSmall";
+       } else if (atype == ATYPE_NORMAL) {
+               num_params = 1;
+               name = "Alloc";
+       } else if (atype == ATYPE_VECTOR) {
+               num_params = 2;
+               name = "AllocVector";
+       } else {
+               g_assert_not_reached ();
+       }
+
+       csig = mono_metadata_signature_alloc (mono_defaults.corlib, num_params);
        csig->ret = &mono_defaults.object_class->byval_arg;
-       csig->params [0] = &mono_defaults.int_class->byval_arg;
+       for (i = 0; i < num_params; ++i)
+               csig->params [i] = &mono_defaults.int_class->byval_arg;
 
-       mb = mono_mb_new (mono_defaults.object_class, "Alloc", MONO_WRAPPER_ALLOC);
+       mb = mono_mb_new (mono_defaults.object_class, name, MONO_WRAPPER_ALLOC);
        size_var = mono_mb_add_local (mb, &mono_defaults.int32_class->byval_arg);
-       /* size = vtable->klass->instance_size; */
-       mono_mb_emit_ldarg (mb, 0);
-       mono_mb_emit_icon (mb, G_STRUCT_OFFSET (MonoVTable, klass));
-       mono_mb_emit_byte (mb, CEE_ADD);
-       mono_mb_emit_byte (mb, CEE_LDIND_I);
-       mono_mb_emit_icon (mb, G_STRUCT_OFFSET (MonoClass, instance_size));
-       mono_mb_emit_byte (mb, CEE_ADD);
-       /* FIXME: assert instance_size stays a 4 byte integer */
-       mono_mb_emit_byte (mb, CEE_LDIND_U4);
-       mono_mb_emit_stloc (mb, size_var);
+       if (atype == ATYPE_NORMAL || atype == ATYPE_SMALL) {
+               /* size = vtable->klass->instance_size; */
+               mono_mb_emit_ldarg (mb, 0);
+               mono_mb_emit_icon (mb, G_STRUCT_OFFSET (MonoVTable, klass));
+               mono_mb_emit_byte (mb, CEE_ADD);
+               mono_mb_emit_byte (mb, CEE_LDIND_I);
+               mono_mb_emit_icon (mb, G_STRUCT_OFFSET (MonoClass, instance_size));
+               mono_mb_emit_byte (mb, CEE_ADD);
+               /* FIXME: assert instance_size stays a 4 byte integer */
+               mono_mb_emit_byte (mb, CEE_LDIND_U4);
+               mono_mb_emit_stloc (mb, size_var);
+       } else if (atype == ATYPE_VECTOR) {
+               MonoExceptionClause *clause;
+               int pos, pos_leave;
+               MonoClass *oom_exc_class;
+               MonoMethod *ctor;
+
+               /* n >  MONO_ARRAY_MAX_INDEX -> OverflowException */
+               mono_mb_emit_ldarg (mb, 1);
+               mono_mb_emit_icon (mb, MONO_ARRAY_MAX_INDEX);
+               pos = mono_mb_emit_short_branch (mb, CEE_BLE_UN_S);
+               mono_mb_emit_exception (mb, "OverflowException", NULL);
+               mono_mb_patch_short_branch (mb, pos);
+
+               clause = mono_image_alloc0 (mono_defaults.corlib, sizeof (MonoExceptionClause));
+               clause->try_offset = mono_mb_get_label (mb);
+
+               /* vtable->klass->sizes.element_size */
+               mono_mb_emit_ldarg (mb, 0);
+               mono_mb_emit_icon (mb, G_STRUCT_OFFSET (MonoVTable, klass));
+               mono_mb_emit_byte (mb, CEE_ADD);
+               mono_mb_emit_byte (mb, CEE_LDIND_I);
+               mono_mb_emit_icon (mb, G_STRUCT_OFFSET (MonoClass, sizes.element_size));
+               mono_mb_emit_byte (mb, CEE_ADD);
+               mono_mb_emit_byte (mb, CEE_LDIND_U4);
+
+               /* * n */
+               mono_mb_emit_ldarg (mb, 1);
+               mono_mb_emit_byte (mb, CEE_MUL_OVF_UN);
+               /* + sizeof (MonoArray) */
+               mono_mb_emit_icon (mb, sizeof (MonoArray));
+               mono_mb_emit_byte (mb, CEE_ADD_OVF_UN);
+               mono_mb_emit_stloc (mb, size_var);
+
+               pos_leave = mono_mb_emit_branch (mb, CEE_LEAVE);
+
+               /* catch */
+               clause->flags = MONO_EXCEPTION_CLAUSE_NONE;
+               clause->try_len = mono_mb_get_pos (mb) - clause->try_offset;
+               clause->data.catch_class = mono_class_from_name (mono_defaults.corlib,
+                               "System", "OverflowException");
+               g_assert (clause->data.catch_class);
+               clause->handler_offset = mono_mb_get_label (mb);
+
+               oom_exc_class = mono_class_from_name (mono_defaults.corlib,
+                               "System", "OutOfMemoryException");
+               g_assert (oom_exc_class);
+               ctor = mono_class_get_method_from_name (oom_exc_class, ".ctor", 0);
+               g_assert (ctor);
+
+               mono_mb_emit_byte (mb, CEE_POP);
+               mono_mb_emit_op (mb, CEE_NEWOBJ, ctor);
+               mono_mb_emit_byte (mb, CEE_THROW);
+
+               clause->handler_len = mono_mb_get_pos (mb) - clause->handler_offset;
+               mono_mb_set_clauses (mb, 1, clause);
+               mono_mb_patch_branch (mb, pos_leave);
+               /* end catch */
+       } else {
+               g_assert_not_reached ();
+       }
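
For ATYPE_VECTOR the code above computes element_size * n + sizeof (MonoArray) with CEE_MUL_OVF_UN / CEE_ADD_OVF_UN inside a small try/catch, rejecting indices above MONO_ARRAY_MAX_INDEX with an OverflowException and surfacing arithmetic overflow as OutOfMemoryException. A C-level sketch of the same size computation with explicit checks; the limit constant and return codes are illustrative only.

#include <stdint.h>
#include <stddef.h>

#define MAX_INDEX_SKETCH  0x7fffffffU   /* stand-in for MONO_ARRAY_MAX_INDEX */

/* 0 on success, -1 for "index too large" (OverflowException in the wrapper),
 * -2 for arithmetic overflow (reported as OutOfMemoryException) */
static int
vector_alloc_size_sketch (size_t element_size, uintptr_t n, size_t header_size, size_t *out)
{
        if (n > MAX_INDEX_SKETCH)
                return -1;
        if (element_size && n > (SIZE_MAX - header_size) / element_size)
                return -2;
        *out = (size_t) n * element_size + header_size;
        return 0;
}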
 
        /* size += ALLOC_ALIGN - 1; */
        mono_mb_emit_ldloc (mb, size_var);
@@ -5752,6 +7315,13 @@ create_allocator (int atype)
        mono_mb_emit_byte (mb, CEE_AND);
        mono_mb_emit_stloc (mb, size_var);
 
+       /* if (size > MAX_SMALL_OBJ_SIZE) goto slowpath */
+       if (atype != ATYPE_SMALL) {
+               mono_mb_emit_ldloc (mb, size_var);
+               mono_mb_emit_icon (mb, MAX_SMALL_OBJ_SIZE);
+               max_size_branch = mono_mb_emit_short_branch (mb, MONO_CEE_BGT_S);
+       }
+
        /*
         * We need to modify tlab_next, but the JIT only supports reading, so we read
         * another tls var holding its address instead.
@@ -5759,9 +7329,7 @@ create_allocator (int atype)
 
        /* tlab_next_addr (local) = tlab_next_addr (TLS var) */
        tlab_next_addr_var = mono_mb_add_local (mb, &mono_defaults.int_class->byval_arg);
-       mono_mb_emit_byte (mb, MONO_CUSTOM_PREFIX);
-       mono_mb_emit_byte (mb, CEE_MONO_TLS);
-       mono_mb_emit_i4 (mb, tlab_next_addr_offset);
+       EMIT_TLS_ACCESS (mb, tlab_next_addr, tlab_next_addr_offset);
        mono_mb_emit_stloc (mb, tlab_next_addr_var);
 
        /* p = (void**)tlab_next; */
@@ -5785,12 +7353,12 @@ create_allocator (int atype)
 
        /* if (G_LIKELY (new_next < tlab_temp_end)) */
        mono_mb_emit_ldloc (mb, new_next_var);
-       mono_mb_emit_byte (mb, MONO_CUSTOM_PREFIX);
-       mono_mb_emit_byte (mb, CEE_MONO_TLS);
-       mono_mb_emit_i4 (mb, tlab_temp_end_offset);
+       EMIT_TLS_ACCESS (mb, tlab_temp_end, tlab_temp_end_offset);
        slowpath_branch = mono_mb_emit_short_branch (mb, MONO_CEE_BLT_UN_S);
 
        /* Slowpath */
+       if (atype != ATYPE_SMALL)
+               mono_mb_patch_short_branch (mb, max_size_branch);
 
        mono_mb_emit_byte (mb, MONO_CUSTOM_PREFIX);
        mono_mb_emit_byte (mb, CEE_MONO_NOT_TAKEN);
@@ -5798,7 +7366,14 @@ create_allocator (int atype)
        /* FIXME: mono_gc_alloc_obj takes a 'size_t' as an argument, not an int32 */
        mono_mb_emit_ldarg (mb, 0);
        mono_mb_emit_ldloc (mb, size_var);
-       mono_mb_emit_icall (mb, mono_gc_alloc_obj);
+       if (atype == ATYPE_NORMAL || atype == ATYPE_SMALL) {
+               mono_mb_emit_icall (mb, mono_gc_alloc_obj);
+       } else if (atype == ATYPE_VECTOR) {
+               mono_mb_emit_ldarg (mb, 1);
+               mono_mb_emit_icall (mb, mono_gc_alloc_vector);
+       } else {
+               g_assert_not_reached ();
+       }
        mono_mb_emit_byte (mb, CEE_RET);
 
        /* Fastpath */
@@ -5810,7 +7385,15 @@ create_allocator (int atype)
        mono_mb_emit_ldloc (mb, p_var);
        mono_mb_emit_ldarg (mb, 0);
        mono_mb_emit_byte (mb, CEE_STIND_I);
-       
+
+       if (atype == ATYPE_VECTOR) {
+               /* arr->max_length = max_length; */
+               mono_mb_emit_ldloc (mb, p_var);
+               mono_mb_emit_ldflda (mb, G_STRUCT_OFFSET (MonoArray, max_length));
+               mono_mb_emit_ldarg (mb, 1);
+               mono_mb_emit_byte (mb, CEE_STIND_I);
+       }
+
        /* return p */
        mono_mb_emit_ldloc (mb, p_var);
        mono_mb_emit_byte (mb, CEE_RET);
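
Taken together, the IL emitted above is the usual TLAB bump-pointer fast path: align the size, bail out for objects above the small-object limit, bump tlab_next, store the vtable (and, for vectors, max_length), and otherwise fall back to the C allocator. A hedged C rendering of that logic follows; the thread-locals, alignment constant and NULL returns are stand-ins for the real tlab_next/tlab_temp_end state and the call into mono_gc_alloc_obj, not the generated wrapper itself.

#include <stddef.h>

#define ALIGN_SKETCH  8         /* stand-in for ALLOC_ALIGN */

/* stand-ins for the thread's TLAB state (tlab_next / tlab_temp_end above) */
static __thread char **tlab_next_addr;
static __thread char *tlab_temp_end;

static void*
alloc_obj_fast_path_sketch (void *vtable, size_t size, size_t max_small_obj_size)
{
        char *p, *new_next;

        size = (size + ALIGN_SKETCH - 1) & ~(size_t)(ALIGN_SKETCH - 1);
        if (size > max_small_obj_size)
                return NULL;            /* slow path: mono_gc_alloc_obj () */

        p = *tlab_next_addr;
        new_next = p + size;
        if (new_next < tlab_temp_end) {
                *tlab_next_addr = new_next;     /* bump the TLAB pointer */
                *(void**)p = vtable;            /* install the vtable */
                return p;                       /* vectors also set max_length here */
        }
        return NULL;                    /* slow path: refill the TLAB or collect */
}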
@@ -5818,10 +7401,39 @@ create_allocator (int atype)
        res = mono_mb_create_method (mb, csig, 8);
        mono_mb_free (mb);
        mono_method_get_header (res)->init_locals = FALSE;
+
+       info = mono_image_alloc0 (mono_defaults.corlib, sizeof (AllocatorWrapperInfo));
+       info->alloc_type = atype;
+       mono_marshal_set_wrapper_info (res, info);
+
        return res;
 }
+#endif
 
 static MonoMethod* alloc_method_cache [ATYPE_NUM];
+static MonoMethod *write_barrier_method;
+
+static gboolean
+is_ip_in_managed_allocator (MonoDomain *domain, gpointer ip)
+{
+       MonoJitInfo *ji;
+       MonoMethod *method;
+       int i;
+
+       if (!ip || !domain)
+               return FALSE;
+       ji = mono_jit_info_table_find (domain, ip);
+       if (!ji)
+               return FALSE;
+       method = ji->method;
+
+       if (method == write_barrier_method)
+               return TRUE;
+       for (i = 0; i < ATYPE_NUM; ++i)
+               if (method == alloc_method_cache [i])
+                       return TRUE;
+       return FALSE;
+}
 
 /*
  * Generate an allocator method implementing the fast path of mono_gc_alloc_obj ().
@@ -5831,14 +7443,21 @@ static MonoMethod* alloc_method_cache [ATYPE_NUM];
 MonoMethod*
 mono_gc_get_managed_allocator (MonoVTable *vtable, gboolean for_box)
 {
+#ifdef MANAGED_ALLOCATION
+       MonoClass *klass = vtable->klass;
+
+#ifdef HAVE_KW_THREAD
        int tlab_next_offset = -1;
        int tlab_temp_end_offset = -1;
-       MonoClass *klass = vtable->klass;
        MONO_THREAD_VAR_OFFSET (tlab_next, tlab_next_offset);
        MONO_THREAD_VAR_OFFSET (tlab_temp_end, tlab_temp_end_offset);
 
        if (tlab_next_offset == -1 || tlab_temp_end_offset == -1)
                return NULL;
+#endif
+
+       if (!mono_runtime_has_tls_get ())
+               return NULL;
        if (klass->instance_size > tlab_size)
                return NULL;
        if (klass->has_finalize || klass->marshalbyref || (mono_profiler_get_events () & MONO_PROFILE_ALLOCATIONS))
@@ -5850,26 +7469,65 @@ mono_gc_get_managed_allocator (MonoVTable *vtable, gboolean for_box)
        if (collect_before_allocs)
                return NULL;
 
-       return mono_gc_get_managed_allocator_by_type (0);
+       if (ALIGN_TO (klass->instance_size, ALLOC_ALIGN) < MAX_SMALL_OBJ_SIZE)
+               return mono_gc_get_managed_allocator_by_type (ATYPE_SMALL);
+       else
+               return mono_gc_get_managed_allocator_by_type (ATYPE_NORMAL);
+#else
+       return NULL;
+#endif
 }
 
-int
-mono_gc_get_managed_allocator_type (MonoMethod *managed_alloc)
+MonoMethod*
+mono_gc_get_managed_array_allocator (MonoVTable *vtable, int rank)
 {
-       return 0;
+#ifdef MANAGED_ALLOCATION
+       MonoClass *klass = vtable->klass;
+
+#ifdef HAVE_KW_THREAD
+       int tlab_next_offset = -1;
+       int tlab_temp_end_offset = -1;
+       MONO_THREAD_VAR_OFFSET (tlab_next, tlab_next_offset);
+       MONO_THREAD_VAR_OFFSET (tlab_temp_end, tlab_temp_end_offset);
+
+       if (tlab_next_offset == -1 || tlab_temp_end_offset == -1)
+               return NULL;
+#endif
+
+       if (rank != 1)
+               return NULL;
+       if (!mono_runtime_has_tls_get ())
+               return NULL;
+       if (mono_profiler_get_events () & MONO_PROFILE_ALLOCATIONS)
+               return NULL;
+       if (collect_before_allocs)
+               return NULL;
+       g_assert (!klass->has_finalize && !klass->marshalbyref);
+
+       return mono_gc_get_managed_allocator_by_type (ATYPE_VECTOR);
+#else
+       return NULL;
+#endif
 }
 
 MonoMethod*
 mono_gc_get_managed_allocator_by_type (int atype)
 {
+#ifdef MANAGED_ALLOCATION
        MonoMethod *res;
 
+       if (!mono_runtime_has_tls_get ())
+               return NULL;
+
        mono_loader_lock ();
        res = alloc_method_cache [atype];
        if (!res)
                res = alloc_method_cache [atype] = create_allocator (atype);
        mono_loader_unlock ();
        return res;
+#else
+       return NULL;
+#endif
 }
 
 guint32
@@ -5878,19 +7536,31 @@ mono_gc_get_managed_allocator_types (void)
        return ATYPE_NUM;
 }
 
-static MonoMethod *write_barrier_method;
 
 MonoMethod*
 mono_gc_get_write_barrier (void)
 {
        MonoMethod *res;
-       int remset_offset = -1;
-       int remset_var, next_var;
        MonoMethodBuilder *mb;
        MonoMethodSignature *sig;
-       int label1, label2;
-
-       MONO_THREAD_VAR_OFFSET (remembered_set, remset_offset);
+#ifdef MANAGED_WBARRIER
+       int label_no_wb_1, label_no_wb_2, label_no_wb_3, label_no_wb_4, label_need_wb, label_slow_path;
+       int buffer_var, buffer_index_var, dummy_var;
+
+#ifdef HAVE_KW_THREAD
+       int stack_end_offset = -1, store_remset_buffer_offset = -1;
+       int store_remset_buffer_index_offset = -1, store_remset_buffer_index_addr_offset = -1;
+
+       MONO_THREAD_VAR_OFFSET (stack_end, stack_end_offset);
+       g_assert (stack_end_offset != -1);
+       MONO_THREAD_VAR_OFFSET (store_remset_buffer, store_remset_buffer_offset);
+       g_assert (store_remset_buffer_offset != -1);
+       MONO_THREAD_VAR_OFFSET (store_remset_buffer_index, store_remset_buffer_index_offset);
+       g_assert (store_remset_buffer_index_offset != -1);
+       MONO_THREAD_VAR_OFFSET (store_remset_buffer_index_addr, store_remset_buffer_index_addr_offset);
+       g_assert (store_remset_buffer_index_addr_offset != -1);
+#endif
+#endif
 
        // FIXME: Maybe create a separate version for ctors (the branch would be
        // correctly predicted more times)
@@ -5898,93 +7568,113 @@ mono_gc_get_write_barrier (void)
                return write_barrier_method;
 
        /* Create the IL version of mono_gc_barrier_generic_store () */
-       sig = mono_metadata_signature_alloc (mono_defaults.corlib, 2);
+       sig = mono_metadata_signature_alloc (mono_defaults.corlib, 1);
        sig->ret = &mono_defaults.void_class->byval_arg;
        sig->params [0] = &mono_defaults.int_class->byval_arg;
-       sig->params [1] = &mono_defaults.object_class->byval_arg;
 
        mb = mono_mb_new (mono_defaults.object_class, "wbarrier", MONO_WRAPPER_WRITE_BARRIER);
 
-       /* ptr_in_nursery () check */
+#ifdef MANAGED_WBARRIER
+       if (mono_runtime_has_tls_get ()) {
 #ifdef ALIGN_NURSERY
-       /* 
-        * Masking out the bits might be faster, but we would have to use 64 bit
-        * immediates, which might be slower.
-        */
-       mono_mb_emit_ldarg (mb, 0);
-       mono_mb_emit_icon (mb, DEFAULT_NURSERY_BITS);
-       mono_mb_emit_byte (mb, CEE_SHR_UN);
-       mono_mb_emit_icon (mb, (mword)nursery_start >> DEFAULT_NURSERY_BITS);
-       label1 = mono_mb_emit_branch (mb, CEE_BNE_UN);
+               // if (ptr_in_nursery (ptr)) return;
+               /*
+                * Masking out the bits might be faster, but we would have to use 64 bit
+                * immediates, which might be slower.
+                */
+               mono_mb_emit_ldarg (mb, 0);
+               mono_mb_emit_icon (mb, DEFAULT_NURSERY_BITS);
+               mono_mb_emit_byte (mb, CEE_SHR_UN);
+               mono_mb_emit_icon (mb, (mword)nursery_start >> DEFAULT_NURSERY_BITS);
+               label_no_wb_1 = mono_mb_emit_branch (mb, CEE_BEQ);
+
+               // if (!ptr_in_nursery (*ptr)) return;
+               mono_mb_emit_ldarg (mb, 0);
+               mono_mb_emit_byte (mb, CEE_LDIND_I);
+               mono_mb_emit_icon (mb, DEFAULT_NURSERY_BITS);
+               mono_mb_emit_byte (mb, CEE_SHR_UN);
+               mono_mb_emit_icon (mb, (mword)nursery_start >> DEFAULT_NURSERY_BITS);
+               label_no_wb_2 = mono_mb_emit_branch (mb, CEE_BNE_UN);
 #else
-       // FIXME:
-       g_assert_not_reached ();
-#endif
-
-       /* Don't need write barrier case */
-       /* do the assignment */
-       mono_mb_emit_ldarg (mb, 0);
-       mono_mb_emit_ldarg (mb, 1);
-       /* Don't use STIND_REF, as it would cause infinite recursion */
-       mono_mb_emit_byte (mb, CEE_STIND_I);
-       mono_mb_emit_byte (mb, CEE_RET);
-
-       /* Need write barrier case */
-       mono_mb_patch_branch (mb, label1);
-
-       if (remset_offset == -1)
                // FIXME:
                g_assert_not_reached ();
+#endif
 
-       // remset_var = remembered_set;
-       remset_var = mono_mb_add_local (mb, &mono_defaults.int_class->byval_arg);
-       mono_mb_emit_byte (mb, MONO_CUSTOM_PREFIX);
-       mono_mb_emit_byte (mb, CEE_MONO_TLS);
-       mono_mb_emit_i4 (mb, remset_offset);
-       mono_mb_emit_stloc (mb, remset_var);
-
-       // next_var = rs->store_next
-       next_var = mono_mb_add_local (mb, &mono_defaults.int_class->byval_arg);
-       mono_mb_emit_ldloc (mb, remset_var);
-       mono_mb_emit_ldflda (mb, G_STRUCT_OFFSET (RememberedSet, store_next));
-       mono_mb_emit_byte (mb, CEE_LDIND_I);
-       mono_mb_emit_stloc (mb, next_var);
-
-       // if (rs->store_next < rs->end_set) {
-       mono_mb_emit_ldloc (mb, next_var);
-       mono_mb_emit_ldloc (mb, remset_var);
-       mono_mb_emit_ldflda (mb, G_STRUCT_OFFSET (RememberedSet, end_set));
-       mono_mb_emit_byte (mb, CEE_LDIND_I);
-       label2 = mono_mb_emit_branch (mb, CEE_BGE);
-
-       /* write barrier fast path */
-       // *(rs->store_next++) = (mword)ptr;
-       mono_mb_emit_ldloc (mb, next_var);
-       mono_mb_emit_ldarg (mb, 0);
-       mono_mb_emit_byte (mb, CEE_STIND_I);
-
-       mono_mb_emit_ldloc (mb, next_var);
-       mono_mb_emit_icon (mb, sizeof (gpointer));
-       mono_mb_emit_byte (mb, CEE_ADD);
-       mono_mb_emit_stloc (mb, next_var);
-
-       mono_mb_emit_ldloc (mb, remset_var);
-       mono_mb_emit_ldflda (mb, G_STRUCT_OFFSET (RememberedSet, store_next));
-       mono_mb_emit_ldloc (mb, next_var);
-       mono_mb_emit_byte (mb, CEE_STIND_I);
-
-       // *(void**)ptr = value;
-       mono_mb_emit_ldarg (mb, 0);
-       mono_mb_emit_ldarg (mb, 1);
-       mono_mb_emit_byte (mb, CEE_STIND_I);
-       mono_mb_emit_byte (mb, CEE_RET);
-
-       /* write barrier slow path */
-       mono_mb_patch_branch (mb, label2);
+               // if (ptr >= stack_end) goto need_wb;
+               mono_mb_emit_ldarg (mb, 0);
+               EMIT_TLS_ACCESS (mb, stack_end, stack_end_offset);
+               label_need_wb = mono_mb_emit_branch (mb, CEE_BGE_UN);
+
+               // if (ptr >= stack_start) return;
+               dummy_var = mono_mb_add_local (mb, &mono_defaults.int_class->byval_arg);
+               mono_mb_emit_ldarg (mb, 0);
+               mono_mb_emit_ldloc_addr (mb, dummy_var);
+               label_no_wb_3 = mono_mb_emit_branch (mb, CEE_BGE_UN);
+
+               // need_wb:
+               mono_mb_patch_branch (mb, label_need_wb);
+
+               // buffer = STORE_REMSET_BUFFER;
+               buffer_var = mono_mb_add_local (mb, &mono_defaults.int_class->byval_arg);
+               EMIT_TLS_ACCESS (mb, store_remset_buffer, store_remset_buffer_offset);
+               mono_mb_emit_stloc (mb, buffer_var);
+
+               // buffer_index = STORE_REMSET_BUFFER_INDEX;
+               buffer_index_var = mono_mb_add_local (mb, &mono_defaults.int_class->byval_arg);
+               EMIT_TLS_ACCESS (mb, store_remset_buffer_index, store_remset_buffer_index_offset);
+               mono_mb_emit_stloc (mb, buffer_index_var);
+
+               // if (buffer [buffer_index] == ptr) return;
+               mono_mb_emit_ldloc (mb, buffer_var);
+               mono_mb_emit_ldloc (mb, buffer_index_var);
+               g_assert (sizeof (gpointer) == 4 || sizeof (gpointer) == 8);
+               mono_mb_emit_icon (mb, sizeof (gpointer) == 4 ? 2 : 3);
+               mono_mb_emit_byte (mb, CEE_SHL);
+               mono_mb_emit_byte (mb, CEE_ADD);
+               mono_mb_emit_byte (mb, CEE_LDIND_I);
+               mono_mb_emit_ldarg (mb, 0);
+               label_no_wb_4 = mono_mb_emit_branch (mb, CEE_BEQ);
+
+               // ++buffer_index;
+               mono_mb_emit_ldloc (mb, buffer_index_var);
+               mono_mb_emit_icon (mb, 1);
+               mono_mb_emit_byte (mb, CEE_ADD);
+               mono_mb_emit_stloc (mb, buffer_index_var);
+
+               // if (buffer_index >= STORE_REMSET_BUFFER_SIZE) goto slow_path;
+               mono_mb_emit_ldloc (mb, buffer_index_var);
+               mono_mb_emit_icon (mb, STORE_REMSET_BUFFER_SIZE);
+               label_slow_path = mono_mb_emit_branch (mb, CEE_BGE);
+
+               // buffer [buffer_index] = ptr;
+               mono_mb_emit_ldloc (mb, buffer_var);
+               mono_mb_emit_ldloc (mb, buffer_index_var);
+               g_assert (sizeof (gpointer) == 4 || sizeof (gpointer) == 8);
+               mono_mb_emit_icon (mb, sizeof (gpointer) == 4 ? 2 : 3);
+               mono_mb_emit_byte (mb, CEE_SHL);
+               mono_mb_emit_byte (mb, CEE_ADD);
+               mono_mb_emit_ldarg (mb, 0);
+               mono_mb_emit_byte (mb, CEE_STIND_I);
+
+               // STORE_REMSET_BUFFER_INDEX = buffer_index;
+               EMIT_TLS_ACCESS (mb, store_remset_buffer_index_addr, store_remset_buffer_index_addr_offset);
+               mono_mb_emit_ldloc (mb, buffer_index_var);
+               mono_mb_emit_byte (mb, CEE_STIND_I);
+
+               // return;
+               mono_mb_patch_branch (mb, label_no_wb_1);
+               mono_mb_patch_branch (mb, label_no_wb_2);
+               mono_mb_patch_branch (mb, label_no_wb_3);
+               mono_mb_patch_branch (mb, label_no_wb_4);
+               mono_mb_emit_byte (mb, CEE_RET);
+
+               // slow path
+               mono_mb_patch_branch (mb, label_slow_path);
+       }
+#endif
 
        mono_mb_emit_ldarg (mb, 0);
-       mono_mb_emit_ldarg (mb, 1);
-       mono_mb_emit_icall (mb, mono_gc_wbarrier_generic_store);
+       mono_mb_emit_icall (mb, mono_gc_wbarrier_generic_nostore);
        mono_mb_emit_byte (mb, CEE_RET);
 
        res = mono_mb_create_method (mb, sig, 16);
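
The managed write barrier emitted above reduces to a few cheap filters followed by an append into the per-thread store remset buffer, with mono_gc_wbarrier_generic_nostore as the slow path once the buffer is full (or when TLS access is unavailable). A hedged C rendering of that fast path follows; the thread-local declarations, buffer size and ptr_in_nursery prototype are stand-ins to keep the sketch self-contained.

#define STORE_REMSET_BUFFER_SIZE_SKETCH 1024    /* stand-in for the real size */

/* stand-ins for per-thread GC state; the two prototypes are collector helpers */
static __thread char *stack_end;
static __thread void **store_remset_buffer;
static __thread long store_remset_buffer_index;
int ptr_in_nursery (void *ptr);
void mono_gc_wbarrier_generic_nostore (void *ptr);

static void
wbarrier_fast_path_sketch (void **ptr)
{
        int dummy;      /* its address approximates the current stack pointer */
        long idx;

        /* stores into the nursery itself never need a remembered-set entry */
        if (ptr_in_nursery (ptr))
                return;
        /* only stores of nursery pointers are interesting */
        if (!ptr_in_nursery (*ptr))
                return;
        /* slots on the current stack are scanned at collection time anyway */
        if ((char*)ptr < stack_end && (char*)ptr >= (char*)&dummy)
                return;

        idx = store_remset_buffer_index;
        /* skip duplicates of the most recently recorded slot */
        if (store_remset_buffer [idx] == (void*)ptr)
                return;
        if (++idx >= STORE_REMSET_BUFFER_SIZE_SKETCH) {
                mono_gc_wbarrier_generic_nostore (ptr); /* buffer full: slow path */
                return;
        }
        store_remset_buffer [idx] = (void*)ptr;
        store_remset_buffer_index = idx;
}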