Merge pull request #4618 from BrzVlad/feature-par-nrs
[mono.git] / mono / sgen / sgen-gc.c
index c689cd237b4ce9b4b35451f85f701988346f86f2..da44b24c334fd08a06ef1098599e8cfd0c55dd74 100644 (file)
@@ -1,5 +1,6 @@
-/*
- * sgen-gc.c: Simple generational GC.
+/**
+ * \file
+ * Simple generational GC.
  *
  * Author:
  *     Paolo Molaro (lupus@ximian.com)
 #include "mono/sgen/sgen-protocol.h"
 #include "mono/sgen/sgen-memory-governor.h"
 #include "mono/sgen/sgen-hash-table.h"
-#include "mono/sgen/sgen-cardtable.h"
 #include "mono/sgen/sgen-pinning.h"
 #include "mono/sgen/sgen-workers.h"
 #include "mono/sgen/sgen-client.h"
@@ -267,6 +267,8 @@ static guint64 stat_pinned_objects = 0;
 static guint64 time_minor_pre_collection_fragment_clear = 0;
 static guint64 time_minor_pinning = 0;
 static guint64 time_minor_scan_remsets = 0;
+static guint64 time_minor_scan_major_blocks = 0;
+static guint64 time_minor_scan_los = 0;
 static guint64 time_minor_scan_pinned = 0;
 static guint64 time_minor_scan_roots = 0;
 static guint64 time_minor_finish_gray_stack = 0;
@@ -426,8 +428,6 @@ sgen_workers_get_job_gray_queue (WorkerData *worker_data, SgenGrayQueue *default
 static void
 gray_queue_redirect (SgenGrayQueue *queue)
 {
-       SGEN_ASSERT (0, concurrent_collection_in_progress, "Where are we redirecting the gray queue to, without a concurrent collection?");
-
        sgen_workers_take_from_queue (queue);
 }
 
@@ -510,22 +510,9 @@ sgen_add_to_global_remset (gpointer ptr, GCObject *obj)
 gboolean
 sgen_drain_gray_stack (ScanCopyContext ctx)
 {
-       ScanObjectFunc scan_func = ctx.ops->scan_object;
-       SgenGrayQueue *queue = ctx.queue;
-
-       if (ctx.ops->drain_gray_stack)
-               return ctx.ops->drain_gray_stack (queue);
+       SGEN_ASSERT (0, ctx.ops->drain_gray_stack, "Why do we have a scan/copy context with a missing drain gray stack function?");
 
-       for (;;) {
-               GCObject *obj;
-               SgenDescriptor desc;
-               GRAY_OBJECT_DEQUEUE (queue, &obj, &desc);
-               if (!obj)
-                       return TRUE;
-               SGEN_LOG (9, "Precise gray object scan %p (%s)", obj, sgen_client_vtable_get_name (SGEN_LOAD_VTABLE (obj)));
-               scan_func (obj, desc, queue);
-       }
-       return FALSE;
+       return ctx.ops->drain_gray_stack (ctx.queue);
 }
 
 /*
@@ -674,7 +661,7 @@ pin_objects_from_nursery_pin_queue (gboolean do_scan_objects, ScanCopyContext ct
                                        safe_object_get_size (obj_to_pin));
 
                        pin_object (obj_to_pin);
-                       GRAY_OBJECT_ENQUEUE (queue, obj_to_pin, desc);
+                       GRAY_OBJECT_ENQUEUE_SERIAL (queue, obj_to_pin, desc);
                        sgen_pin_stats_register_object (obj_to_pin, GENERATION_NURSERY);
                        definitely_pinned [count] = obj_to_pin;
                        count++;
@@ -724,7 +711,7 @@ sgen_pin_object (GCObject *object, SgenGrayQueue *queue)
        ++objects_pinned;
        sgen_pin_stats_register_object (object, GENERATION_NURSERY);
 
-       GRAY_OBJECT_ENQUEUE (queue, object, sgen_obj_get_descriptor_safe (object));
+       GRAY_OBJECT_ENQUEUE_SERIAL (queue, object, sgen_obj_get_descriptor_safe (object));
 }
 
 /* Sort the addresses in array in increasing order.
@@ -872,6 +859,7 @@ static void
 precisely_scan_objects_from (void** start_root, void** end_root, char* n_start, char *n_end, SgenDescriptor desc, ScanCopyContext ctx)
 {
        CopyOrMarkObjectFunc copy_func = ctx.ops->copy_or_mark_object;
+       ScanPtrFieldFunc scan_field_func = ctx.ops->scan_ptr_field;
        SgenGrayQueue *queue = ctx.queue;
 
        switch (desc & ROOT_DESC_TYPE_MASK) {
@@ -906,6 +894,15 @@ precisely_scan_objects_from (void** start_root, void** end_root, char* n_start,
                }
                break;
        }
+       case ROOT_DESC_VECTOR: {
+               void **p;
+
+               for (p = start_root; p < end_root; p++) {
+                       if (*p)
+                               scan_field_func (NULL, (GCObject**)p, queue);
+               }
+               break;
+       }
        case ROOT_DESC_USER: {
                SgenUserRootMarkFunc marker = sgen_get_user_descriptor_func (desc);
                marker (start_root, single_arg_user_copy_or_mark, &ctx);
@@ -1236,6 +1233,8 @@ init_stats (void)
        mono_counters_register ("Minor fragment clear", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_minor_pre_collection_fragment_clear);
        mono_counters_register ("Minor pinning", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_minor_pinning);
        mono_counters_register ("Minor scan remembered set", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_minor_scan_remsets);
+       mono_counters_register ("Minor scan major blocks", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_minor_scan_major_blocks);
+       mono_counters_register ("Minor scan los", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_minor_scan_los);
        mono_counters_register ("Minor scan pinned", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_minor_scan_pinned);
        mono_counters_register ("Minor scan roots", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_minor_scan_roots);
        mono_counters_register ("Minor fragment creation", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_minor_fragment_creation);
@@ -1336,12 +1335,6 @@ scan_copy_context_for_scan_job (void *worker_data_untyped, ScanJob *job)
        return CONTEXT_FROM_OBJECT_OPERATIONS (job->ops, sgen_workers_get_job_gray_queue (worker_data, job->gc_thread_gray_queue));
 }
 
-static void
-job_remembered_set_scan (void *worker_data_untyped, SgenThreadPoolJob *job)
-{
-       remset.scan_remsets (scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job));
-}
-
 typedef struct {
        ScanJob scan_job;
        char *heap_start;
@@ -1387,6 +1380,43 @@ job_scan_finalizer_entries (void *worker_data_untyped, SgenThreadPoolJob *job)
        scan_finalizer_entries (job_data->queue, ctx);
 }
 
+static void
+job_scan_wbroots (void *worker_data_untyped, SgenThreadPoolJob *job)
+{
+       ScanJob *job_data = (ScanJob*)job;
+       ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, job_data);
+
+       sgen_wbroots_scan_card_table (ctx);
+}
+
+static void
+job_scan_major_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
+{
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
+       ParallelScanJob *job_data = (ParallelScanJob*)job;
+       ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
+
+       SGEN_TV_GETTIME (atv);
+       major_collector.scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+       SGEN_TV_GETTIME (btv);
+       time_minor_scan_major_blocks += SGEN_TV_ELAPSED (atv, btv);
+}
+
+static void
+job_scan_los_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
+{
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
+       ParallelScanJob *job_data = (ParallelScanJob*)job;
+       ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
+
+       SGEN_TV_GETTIME (atv);
+       sgen_los_scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+       SGEN_TV_GETTIME (btv);
+       time_minor_scan_los += SGEN_TV_ELAPSED (atv, btv);
+}
+
 static void
 job_scan_major_mod_union_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
@@ -1478,6 +1508,34 @@ init_gray_queue (SgenGrayQueue *gc_thread_gray_queue, gboolean use_workers)
        sgen_gray_object_queue_init (gc_thread_gray_queue, NULL, TRUE);
 }
 
+static void
+enqueue_scan_remembered_set_jobs (SgenGrayQueue *gc_thread_gray_queue, SgenObjectOperations *ops, gboolean enqueue)
+{
+       int i, split_count = sgen_workers_get_job_split_count ();
+       ScanJob *sj;
+
+       sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan wbroots", job_scan_wbroots, sizeof (ScanJob));
+       sj->ops = ops;
+       sj->gc_thread_gray_queue = gc_thread_gray_queue;
+       sgen_workers_enqueue_job (&sj->job, enqueue);
+
+       for (i = 0; i < split_count; i++) {
+               ParallelScanJob *psj;
+
+               psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan major remsets", job_scan_major_card_table, sizeof (ParallelScanJob));
+               psj->scan_job.ops = ops;
+               psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
+               psj->job_index = i;
+               sgen_workers_enqueue_job (&psj->scan_job.job, enqueue);
+
+               psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan LOS remsets", job_scan_los_card_table, sizeof (ParallelScanJob));
+               psj->scan_job.ops = ops;
+               psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
+               psj->job_index = i;
+               sgen_workers_enqueue_job (&psj->scan_job.job, enqueue);
+       }
+}
+
 static void
 enqueue_scan_from_roots_jobs (SgenGrayQueue *gc_thread_gray_queue, char *heap_start, char *heap_end, SgenObjectOperations *ops, gboolean enqueue)
 {
@@ -1495,13 +1553,16 @@ enqueue_scan_from_roots_jobs (SgenGrayQueue *gc_thread_gray_queue, char *heap_st
        scrrj->root_type = ROOT_TYPE_NORMAL;
        sgen_workers_enqueue_job (&scrrj->scan_job.job, enqueue);
 
-       scrrj = (ScanFromRegisteredRootsJob*)sgen_thread_pool_job_alloc ("scan from registered roots wbarrier", job_scan_from_registered_roots, sizeof (ScanFromRegisteredRootsJob));
-       scrrj->scan_job.ops = ops;
-       scrrj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
-       scrrj->heap_start = heap_start;
-       scrrj->heap_end = heap_end;
-       scrrj->root_type = ROOT_TYPE_WBARRIER;
-       sgen_workers_enqueue_job (&scrrj->scan_job.job, enqueue);
+       if (current_collection_generation == GENERATION_OLD) {
+               /* During minors we scan the cardtable for these roots instead */
+               scrrj = (ScanFromRegisteredRootsJob*)sgen_thread_pool_job_alloc ("scan from registered roots wbarrier", job_scan_from_registered_roots, sizeof (ScanFromRegisteredRootsJob));
+               scrrj->scan_job.ops = ops;
+               scrrj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
+               scrrj->heap_start = heap_start;
+               scrrj->heap_end = heap_end;
+               scrrj->root_type = ROOT_TYPE_WBARRIER;
+               sgen_workers_enqueue_job (&scrrj->scan_job.job, enqueue);
+       }
 
        /* Threads */
 
@@ -1535,13 +1596,12 @@ enqueue_scan_from_roots_jobs (SgenGrayQueue *gc_thread_gray_queue, char *heap_st
 static gboolean
 collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_queue)
 {
-       gboolean needs_major;
+       gboolean needs_major, is_parallel = FALSE;
        size_t max_garbage_amount;
        char *nursery_next;
        mword fragment_total;
-       ScanJob *sj;
        SgenGrayQueue gc_thread_gray_queue;
-       SgenObjectOperations *object_ops;
+       SgenObjectOperations *object_ops_nopar, *object_ops_par = NULL;
        ScanCopyContext ctx;
        TV_DECLARE (atv);
        TV_DECLARE (btv);
@@ -1556,10 +1616,16 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
 
        binary_protocol_collection_begin (gc_stats.minor_gc_count, GENERATION_NURSERY);
 
-       if (sgen_concurrent_collection_in_progress ())
-               object_ops = &sgen_minor_collector.serial_ops_with_concurrent_major;
-       else
-               object_ops = &sgen_minor_collector.serial_ops;
+       if (sgen_concurrent_collection_in_progress ()) {
+               /* FIXME Support parallel nursery collections with concurrent major */
+               object_ops_nopar = &sgen_minor_collector.serial_ops_with_concurrent_major;
+       } else {
+               object_ops_nopar = &sgen_minor_collector.serial_ops;
+               if (sgen_minor_collector.is_parallel) {
+                       object_ops_par = &sgen_minor_collector.parallel_ops;
+                       is_parallel = TRUE;
+               }
+       }
 
        if (do_verify_nursery || do_dump_nursery_content)
                sgen_debug_verify_nursery (do_dump_nursery_content);
@@ -1596,8 +1662,8 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
 
        sgen_memgov_minor_collection_start ();
 
-       init_gray_queue (&gc_thread_gray_queue, FALSE);
-       ctx = CONTEXT_FROM_OBJECT_OPERATIONS (object_ops, &gc_thread_gray_queue);
+       init_gray_queue (&gc_thread_gray_queue, is_parallel);
+       ctx = CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, &gc_thread_gray_queue);
 
        gc_stats.minor_gc_count ++;
 
@@ -1629,10 +1695,9 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
        SGEN_LOG (2, "Finding pinned pointers: %zd in %lld usecs", sgen_get_pinned_count (), (long long)TV_ELAPSED (btv, atv));
        SGEN_LOG (4, "Start scan with %zd pinned objects", sgen_get_pinned_count ());
 
-       sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan remset", job_remembered_set_scan, sizeof (ScanJob));
-       sj->ops = object_ops;
-       sj->gc_thread_gray_queue = &gc_thread_gray_queue;
-       sgen_workers_enqueue_job (&sj->job, FALSE);
+       remset.start_scan_remsets ();
+
+       enqueue_scan_remembered_set_jobs (&gc_thread_gray_queue, is_parallel ? object_ops_par : object_ops_nopar, is_parallel);
 
        /* we don't have complete write barrier yet, so we scan all the old generation sections */
        TV_GETTIME (btv);
@@ -1647,7 +1712,13 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
        TV_GETTIME (atv);
        time_minor_scan_pinned += TV_ELAPSED (btv, atv);
 
-       enqueue_scan_from_roots_jobs (&gc_thread_gray_queue, sgen_get_nursery_start (), nursery_next, object_ops, FALSE);
+       enqueue_scan_from_roots_jobs (&gc_thread_gray_queue, sgen_get_nursery_start (), nursery_next, is_parallel ? object_ops_par : object_ops_nopar, is_parallel);
+
+       if (is_parallel) {
+               gray_queue_redirect (&gc_thread_gray_queue);
+               sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
+               sgen_workers_join ();
+       }
 
        TV_GETTIME (btv);
        time_minor_scan_roots += TV_ELAPSED (atv, btv);
@@ -1709,8 +1780,6 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
 
        sgen_gray_object_queue_dispose (&gc_thread_gray_queue);
 
-       remset.finish_minor_collection ();
-
        check_scan_starts ();
 
        binary_protocol_flush_buffers (FALSE);
@@ -1737,7 +1806,7 @@ typedef enum {
 } CopyOrMarkFromRootsMode;
 
 static void
-major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_next_pin_slot, CopyOrMarkFromRootsMode mode, SgenObjectOperations *object_ops, SgenObjectOperations *worker_object_ops)
+major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_next_pin_slot, CopyOrMarkFromRootsMode mode, SgenObjectOperations *object_ops_nopar, SgenObjectOperations *object_ops_par)
 {
        LOSObject *bigobj;
        TV_DECLARE (atv);
@@ -1747,7 +1816,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
         */
        char *heap_start = NULL;
        char *heap_end = (char*)-1;
-       ScanCopyContext ctx = CONTEXT_FROM_OBJECT_OPERATIONS (object_ops, gc_thread_gray_queue);
+       ScanCopyContext ctx = CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, gc_thread_gray_queue);
        gboolean concurrent = mode != COPY_OR_MARK_FROM_ROOTS_SERIAL;
 
        SGEN_ASSERT (0, !!concurrent == !!concurrent_collection_in_progress, "We've been called with the wrong mode.");
@@ -1840,7 +1909,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
                        }
                        sgen_los_pin_object (bigobj->data);
                        if (SGEN_OBJECT_HAS_REFERENCES (bigobj->data))
-                               GRAY_OBJECT_ENQUEUE (gc_thread_gray_queue, bigobj->data, sgen_obj_get_descriptor ((GCObject*)bigobj->data));
+                               GRAY_OBJECT_ENQUEUE_SERIAL (gc_thread_gray_queue, bigobj->data, sgen_obj_get_descriptor ((GCObject*)bigobj->data));
                        sgen_pin_stats_register_object (bigobj->data, GENERATION_OLD);
                        SGEN_LOG (6, "Marked large object %p (%s) size: %lu from roots", bigobj->data,
                                        sgen_client_vtable_get_name (SGEN_LOAD_VTABLE (bigobj->data)),
@@ -1867,12 +1936,14 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 
        SGEN_ASSERT (0, sgen_workers_all_done (), "Why are the workers not done when we start or finish a major collection?");
        if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
+               if (object_ops_par != NULL)
+                       sgen_workers_set_num_active_workers (0);
                if (sgen_workers_have_idle_work ()) {
                        /*
                         * We force the finish of the worker with the new object ops context
                         * which can also do copying. We need to have finished pinning.
                         */
-                       sgen_workers_start_all_workers (worker_object_ops, NULL);
+                       sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
 
                        sgen_workers_join ();
                }
@@ -1889,7 +1960,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 
        sgen_client_collecting_major_3 (&fin_ready_queue, &critical_fin_queue);
 
-       enqueue_scan_from_roots_jobs (gc_thread_gray_queue, heap_start, heap_end, object_ops, FALSE);
+       enqueue_scan_from_roots_jobs (gc_thread_gray_queue, heap_start, heap_end, object_ops_nopar, FALSE);
 
        TV_GETTIME (btv);
        time_major_scan_roots += TV_ELAPSED (atv, btv);
@@ -1900,46 +1971,52 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
         * the roots.
         */
        if (mode == COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT) {
+               sgen_workers_set_num_active_workers (1);
                gray_queue_redirect (gc_thread_gray_queue);
                if (precleaning_enabled) {
-                       sgen_workers_start_all_workers (worker_object_ops, workers_finish_callback);
+                       sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, workers_finish_callback);
                } else {
-                       sgen_workers_start_all_workers (worker_object_ops, NULL);
+                       sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
                }
        }
 
        if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
                int i, split_count = sgen_workers_get_job_split_count ();
+               gboolean parallel = object_ops_par != NULL;
 
-               gray_queue_redirect (gc_thread_gray_queue);
+               /* If we're not parallel we finish the collection on the gc thread */
+               if (parallel)
+                       gray_queue_redirect (gc_thread_gray_queue);
 
                /* Mod union card table */
                for (i = 0; i < split_count; i++) {
                        ParallelScanJob *psj;
 
                        psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan mod union cardtable", job_scan_major_mod_union_card_table, sizeof (ParallelScanJob));
-                       psj->scan_job.ops = worker_object_ops;
-                       psj->scan_job.gc_thread_gray_queue = NULL;
+                       psj->scan_job.ops = object_ops_par ? object_ops_par : object_ops_nopar;
+                       psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
                        psj->job_index = i;
-                       sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+                       sgen_workers_enqueue_job (&psj->scan_job.job, parallel);
 
                        psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan LOS mod union cardtable", job_scan_los_mod_union_card_table, sizeof (ParallelScanJob));
-                       psj->scan_job.ops = worker_object_ops;
-                       psj->scan_job.gc_thread_gray_queue = NULL;
+                       psj->scan_job.ops = object_ops_par ? object_ops_par : object_ops_nopar;
+                       psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
                        psj->job_index = i;
-                       sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+                       sgen_workers_enqueue_job (&psj->scan_job.job, parallel);
                }
 
-               /*
-                * If we enqueue a job while workers are running we need to sgen_workers_ensure_awake
-                * in order to make sure that we are running the idle func and draining all worker
-                * gray queues. The operation of starting workers implies this, so we start them after
-                * in order to avoid doing this operation twice. The workers will drain the main gray
-                * stack that contained roots and pinned objects and also scan the mod union card
-                * table.
-                */
-               sgen_workers_start_all_workers (worker_object_ops, NULL);
-               sgen_workers_join ();
+               if (parallel) {
+                       /*
+                        * If we enqueue a job while workers are running we need to sgen_workers_ensure_awake
+                        * in order to make sure that we are running the idle func and draining all worker
+                        * gray queues. The operation of starting workers implies this, so we start them after
+                        * in order to avoid doing this operation twice. The workers will drain the main gray
+                        * stack that contained roots and pinned objects and also scan the mod union card
+                        * table.
+                        */
+                       sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
+                       sgen_workers_join ();
+               }
        }
 
        sgen_pin_stats_report ();
@@ -1957,7 +2034,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 static void
 major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason, gboolean concurrent, size_t *old_next_pin_slot)
 {
-       SgenObjectOperations *object_ops;
+       SgenObjectOperations *object_ops_nopar, *object_ops_par = NULL;
 
        binary_protocol_collection_begin (gc_stats.major_gc_count, GENERATION_OLD);
 
@@ -1972,13 +2049,12 @@ major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason,
                g_assert (major_collector.is_concurrent);
                concurrent_collection_in_progress = TRUE;
 
+               object_ops_nopar = &major_collector.major_ops_concurrent_start;
                if (major_collector.is_parallel)
-                       object_ops = &major_collector.major_ops_conc_par_start;
-               else
-                       object_ops = &major_collector.major_ops_concurrent_start;
+                       object_ops_par = &major_collector.major_ops_conc_par_start;
 
        } else {
-               object_ops = &major_collector.major_ops_serial;
+               object_ops_nopar = &major_collector.major_ops_serial;
        }
 
        reset_pinned_from_failed_allocation ();
@@ -1997,14 +2073,14 @@ major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason,
        if (major_collector.start_major_collection)
                major_collector.start_major_collection ();
 
-       major_copy_or_mark_from_roots (gc_thread_gray_queue, old_next_pin_slot, concurrent ? COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT : COPY_OR_MARK_FROM_ROOTS_SERIAL, object_ops, object_ops);
+       major_copy_or_mark_from_roots (gc_thread_gray_queue, old_next_pin_slot, concurrent ? COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT : COPY_OR_MARK_FROM_ROOTS_SERIAL, object_ops_nopar, object_ops_par);
 }
 
 static void
 major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason, gboolean is_overflow, size_t old_next_pin_slot, gboolean forced)
 {
        ScannedObjectCounts counts;
-       SgenObjectOperations *object_ops;
+       SgenObjectOperations *object_ops_nopar;
        mword fragment_total;
        TV_DECLARE (atv);
        TV_DECLARE (btv);
@@ -2012,25 +2088,24 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
        TV_GETTIME (btv);
 
        if (concurrent_collection_in_progress) {
-               SgenObjectOperations *worker_object_ops;
-               object_ops = &major_collector.major_ops_concurrent_finish;
+               SgenObjectOperations *object_ops_par = NULL;
+
+               object_ops_nopar = &major_collector.major_ops_concurrent_finish;
                if (major_collector.is_parallel)
-                       worker_object_ops = &major_collector.major_ops_conc_par_finish;
-               else
-                       worker_object_ops = object_ops;
+                       object_ops_par = &major_collector.major_ops_conc_par_finish;
 
-               major_copy_or_mark_from_roots (gc_thread_gray_queue, NULL, COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT, object_ops, worker_object_ops);
+               major_copy_or_mark_from_roots (gc_thread_gray_queue, NULL, COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT, object_ops_nopar, object_ops_par);
 
 #ifdef SGEN_DEBUG_INTERNAL_ALLOC
                main_gc_thread = NULL;
 #endif
        } else {
-               object_ops = &major_collector.major_ops_serial;
+               object_ops_nopar = &major_collector.major_ops_serial;
        }
 
        sgen_workers_assert_gray_queue_is_empty ();
 
-       finish_gray_stack (GENERATION_OLD, CONTEXT_FROM_OBJECT_OPERATIONS (object_ops, gc_thread_gray_queue));
+       finish_gray_stack (GENERATION_OLD, CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, gc_thread_gray_queue));
        TV_GETTIME (atv);
        time_major_finish_gray_stack += TV_ELAPSED (btv, atv);
 
@@ -2613,6 +2688,94 @@ sgen_deregister_root (char* addr)
        UNLOCK_GC;
 }
 
+void
+sgen_wbroots_iterate_live_block_ranges (sgen_cardtable_block_callback cb)
+{
+       void **start_root;
+       RootRecord *root;
+       SGEN_HASH_TABLE_FOREACH (&roots_hash [ROOT_TYPE_WBARRIER], void **, start_root, RootRecord *, root) {
+               cb ((mword)start_root, (mword)root->end_root - (mword)start_root);
+       } SGEN_HASH_TABLE_FOREACH_END;
+}
+
+/* Root equivalent of sgen_client_cardtable_scan_object */
+static void
+sgen_wbroot_scan_card_table (void** start_root, mword size,  ScanCopyContext ctx)
+{
+       ScanPtrFieldFunc scan_field_func = ctx.ops->scan_ptr_field;
+       guint8 *card_data = sgen_card_table_get_card_scan_address ((mword)start_root);
+       guint8 *card_base = card_data;
+       mword card_count = sgen_card_table_number_of_cards_in_range ((mword)start_root, size);
+       guint8 *card_data_end = card_data + card_count;
+       mword extra_idx = 0;
+       char *obj_start = sgen_card_table_align_pointer (start_root);
+       char *obj_end = (char*)start_root + size;
+#ifdef SGEN_HAVE_OVERLAPPING_CARDS
+       guint8 *overflow_scan_end = NULL;
+#endif
+
+#ifdef SGEN_HAVE_OVERLAPPING_CARDS
+       /* Check for overflow and, if so, set up to scan in two steps */
+       if (card_data_end >= SGEN_SHADOW_CARDTABLE_END) {
+               overflow_scan_end = sgen_shadow_cardtable + (card_data_end - SGEN_SHADOW_CARDTABLE_END);
+               card_data_end = SGEN_SHADOW_CARDTABLE_END;
+       }
+
+LOOP_HEAD:
+#endif
+
+       card_data = sgen_find_next_card (card_data, card_data_end);
+
+       for (; card_data < card_data_end; card_data = sgen_find_next_card (card_data + 1, card_data_end)) {
+               size_t idx = (card_data - card_base) + extra_idx;
+               char *start = (char*)(obj_start + idx * CARD_SIZE_IN_BYTES);
+               char *card_end = start + CARD_SIZE_IN_BYTES;
+               char *elem = start, *first_elem = start;
+
+               /*
+                * Don't clean the first and last card on 32-bit systems, since
+                * they may also be part of other roots.
+                */
+               if (card_data != card_base && card_data != (card_data_end - 1))
+                       sgen_card_table_prepare_card_for_scanning (card_data);
+
+               card_end = MIN (card_end, obj_end);
+
+               if (elem < (char*)start_root)
+                       first_elem = elem = (char*)start_root;
+
+               for (; elem < card_end; elem += SIZEOF_VOID_P) {
+                       if (*(GCObject**)elem)
+                               scan_field_func (NULL, (GCObject**)elem, ctx.queue);
+               }
+
+               binary_protocol_card_scan (first_elem, elem - first_elem);
+       }
+
+#ifdef SGEN_HAVE_OVERLAPPING_CARDS
+       if (overflow_scan_end) {
+               extra_idx = card_data - card_base;
+               card_base = card_data = sgen_shadow_cardtable;
+               card_data_end = overflow_scan_end;
+               overflow_scan_end = NULL;
+               goto LOOP_HEAD;
+       }
+#endif
+}
+
+void
+sgen_wbroots_scan_card_table (ScanCopyContext ctx)
+{
+       void **start_root;
+       RootRecord *root;
+
+       SGEN_HASH_TABLE_FOREACH (&roots_hash [ROOT_TYPE_WBARRIER], void **, start_root, RootRecord *, root) {
+               SGEN_ASSERT (0, (root->root_desc & ROOT_DESC_TYPE_MASK) == ROOT_DESC_VECTOR, "Unsupported root type");
+
+               sgen_wbroot_scan_card_table (start_root, (mword)root->end_root - (mword)start_root, ctx);
+       } SGEN_HASH_TABLE_FOREACH_END;
+}
+
 /*
  * ######################################################################
  * ########  Thread handling (stop/start code)
@@ -2654,6 +2817,9 @@ sgen_thread_unregister (SgenThreadInfo *p)
  * the conservative scan, otherwise by the remembered set scan.
  */
 
+/**
+ * mono_gc_wbarrier_arrayref_copy:
+ */
 void
 mono_gc_wbarrier_arrayref_copy (gpointer dest_ptr, gpointer src_ptr, int count)
 {
@@ -2679,6 +2845,9 @@ mono_gc_wbarrier_arrayref_copy (gpointer dest_ptr, gpointer src_ptr, int count)
        remset.wbarrier_arrayref_copy (dest_ptr, src_ptr, count);
 }
 
+/**
+ * mono_gc_wbarrier_generic_nostore:
+ */
 void
 mono_gc_wbarrier_generic_nostore (gpointer ptr)
 {
@@ -2706,6 +2875,9 @@ mono_gc_wbarrier_generic_nostore (gpointer ptr)
        remset.wbarrier_generic_nostore (ptr);
 }
 
+/**
+ * mono_gc_wbarrier_generic_store:
+ */
 void
 mono_gc_wbarrier_generic_store (gpointer ptr, GCObject* value)
 {
@@ -2716,7 +2888,9 @@ mono_gc_wbarrier_generic_store (gpointer ptr, GCObject* value)
        sgen_dummy_use (value);
 }
 
-/* Same as mono_gc_wbarrier_generic_store () but performs the store
+/**
+ * mono_gc_wbarrier_generic_store_atomic:
+ * Same as \c mono_gc_wbarrier_generic_store but performs the store
  * as an atomic operation with release semantics.
  */
 void
@@ -2825,7 +2999,7 @@ parse_double_in_interval (const char *env_var, const char *opt_name, const char
 void
 sgen_gc_init (void)
 {
-       const char *env;
+       char *env;
        char **opts, **ptr;
        char *major_collector_opt = NULL;
        char *minor_collector_opt = NULL;
@@ -2870,6 +3044,7 @@ sgen_gc_init (void)
 
        if ((env = g_getenv (MONO_GC_PARAMS_NAME)) || gc_params_options) {
                params_opts = g_strdup_printf ("%s,%s", gc_params_options ? gc_params_options : "", env ? env : "");
+               g_free (env);
        }
 
        if (params_opts) {
@@ -2904,11 +3079,13 @@ sgen_gc_init (void)
        sgen_client_init ();
 
        if (!minor_collector_opt) {
-               sgen_simple_nursery_init (&sgen_minor_collector);
+               sgen_simple_nursery_init (&sgen_minor_collector, FALSE);
        } else {
                if (!strcmp (minor_collector_opt, "simple")) {
                use_simple_nursery:
-                       sgen_simple_nursery_init (&sgen_minor_collector);
+                       sgen_simple_nursery_init (&sgen_minor_collector, FALSE);
+               } else if (!strcmp (minor_collector_opt, "simple-par")) {
+                       sgen_simple_nursery_init (&sgen_minor_collector, TRUE);
                } else if (!strcmp (minor_collector_opt, "split")) {
                        sgen_split_nursery_init (&sgen_minor_collector);
                } else {
@@ -3087,6 +3264,7 @@ sgen_gc_init (void)
 
        if ((env = g_getenv (MONO_GC_DEBUG_NAME)) || gc_debug_options) {
                debug_opts = g_strdup_printf ("%s,%s", gc_debug_options ? gc_debug_options  : "", env ? env : "");
+               g_free (env);
        }
 
        if (debug_opts) {
@@ -3232,9 +3410,9 @@ sgen_gc_init (void)
        if (major_collector.post_param_init)
                major_collector.post_param_init (&major_collector);
 
-       if (major_collector.needs_thread_pool) {
+       if (major_collector.is_concurrent || sgen_minor_collector.is_parallel) {
                int num_workers = 1;
-               if (major_collector.is_parallel) {
+               if (major_collector.is_parallel || sgen_minor_collector.is_parallel) {
                        /* FIXME Detect the number of physical cores, instead of logical */
                        num_workers = mono_cpu_count () / 2;
                        if (num_workers < 1)
@@ -3298,6 +3476,12 @@ sgen_get_major_collector (void)
        return &major_collector;
 }
 
+SgenMinorCollector*
+sgen_get_minor_collector (void)
+{
+       return &sgen_minor_collector;
+}
+
 SgenRememberedSet*
 sgen_get_remset (void)
 {