[sgen] Add explicit binary protocol stats for collections
[mono.git] / mono / sgen / sgen-gc.c
index 6cd939553a99cb91d0553e6a2e80e035fccbae43..ea67897afca98cbae0e5ec460916cdab92894ee1 100644 (file)
@@ -282,7 +282,8 @@ static guint64 time_major_pre_collection_fragment_clear = 0;
 static guint64 time_major_pinning = 0;
 static guint64 time_major_scan_pinned = 0;
 static guint64 time_major_scan_roots = 0;
-static guint64 time_major_scan_mod_union = 0;
+static guint64 time_major_scan_mod_union_blocks = 0;
+static guint64 time_major_scan_mod_union_los = 0;
 static guint64 time_major_finish_gray_stack = 0;
 static guint64 time_major_free_bigobjs = 0;
 static guint64 time_major_los_sweep = 0;
@@ -454,7 +455,7 @@ sgen_workers_get_job_gray_queue (WorkerData *worker_data, SgenGrayQueue *default
 static void
 gray_queue_redirect (SgenGrayQueue *queue)
 {
-       sgen_workers_take_from_queue (queue);
+       sgen_workers_take_from_queue (current_collection_generation, queue);
 }
 
 void
@@ -1279,7 +1280,8 @@ init_stats (void)
        mono_counters_register ("Major pinning", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_pinning);
        mono_counters_register ("Major scan pinned", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_pinned);
        mono_counters_register ("Major scan roots", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_roots);
-       mono_counters_register ("Major scan mod union", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_mod_union);
+       mono_counters_register ("Major scan mod union blocks", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_mod_union_blocks);
+       mono_counters_register ("Major scan mod union los", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_mod_union_los);
        mono_counters_register ("Major finish gray stack", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_finish_gray_stack);
        mono_counters_register ("Major free big objects", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_free_bigobjs);
        mono_counters_register ("Major LOS sweep", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_los_sweep);
@@ -1360,7 +1362,7 @@ typedef struct {
 
 typedef struct {
        ScanJob scan_job;
-       int job_index;
+       int job_index, job_split_count;
 } ParallelScanJob;
 
 static ScanCopyContext
@@ -1375,7 +1377,7 @@ scan_copy_context_for_scan_job (void *worker_data_untyped, ScanJob *job)
                 * object ops changes, like forced concurrent finish.
                 */
                SGEN_ASSERT (0, sgen_workers_is_worker_thread (mono_native_thread_id_get ()), "We need a context for the scan job");
-               job->ops = sgen_workers_get_idle_func_object_ops ();
+               job->ops = sgen_workers_get_idle_func_object_ops (worker_data);
        }
 
        return CONTEXT_FROM_OBJECT_OPERATIONS (job->ops, sgen_workers_get_job_gray_queue (worker_data, job->gc_thread_gray_queue));
@@ -1444,9 +1446,12 @@ job_scan_major_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        SGEN_TV_GETTIME (atv);
-       major_collector.scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+       major_collector.scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, job_data->job_index, job_data->job_split_count);
        SGEN_TV_GETTIME (btv);
        time_minor_scan_major_blocks += SGEN_TV_ELAPSED (atv, btv);
+
+       if (worker_data_untyped)
+               ((WorkerData*)worker_data_untyped)->major_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
@@ -1458,51 +1463,82 @@ job_scan_los_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        SGEN_TV_GETTIME (atv);
-       sgen_los_scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+       sgen_los_scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, job_data->job_index, job_data->job_split_count);
        SGEN_TV_GETTIME (btv);
        time_minor_scan_los += SGEN_TV_ELAPSED (atv, btv);
+
+       if (worker_data_untyped)
+               ((WorkerData*)worker_data_untyped)->los_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
 job_scan_major_mod_union_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
        ParallelScanJob *job_data = (ParallelScanJob*)job;
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
-       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+       SGEN_TV_GETTIME (atv);
+       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, job_data->job_split_count);
+       SGEN_TV_GETTIME (btv);
+       time_major_scan_mod_union_blocks += SGEN_TV_ELAPSED (atv, btv);
+
+       if (worker_data_untyped)
+               ((WorkerData*)worker_data_untyped)->major_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
 job_scan_los_mod_union_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
        ParallelScanJob *job_data = (ParallelScanJob*)job;
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
-       sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+       SGEN_TV_GETTIME (atv);
+       sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, job_data->job_split_count);
+       SGEN_TV_GETTIME (btv);
+       time_major_scan_mod_union_los += SGEN_TV_ELAPSED (atv, btv);
+
+       if (worker_data_untyped)
+               ((WorkerData*)worker_data_untyped)->los_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
 job_major_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
        ParallelScanJob *job_data = (ParallelScanJob*)job;
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
+       SGEN_TV_GETTIME (atv);
+       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, job_data->job_split_count);
+       SGEN_TV_GETTIME (btv);
 
-       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+       g_assert (worker_data_untyped);
+       ((WorkerData*)worker_data_untyped)->major_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
 job_los_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
        ParallelScanJob *job_data = (ParallelScanJob*)job;
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
+       SGEN_TV_GETTIME (atv);
+       sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, job_data->job_split_count);
+       SGEN_TV_GETTIME (btv);
 
-       sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+       g_assert (worker_data_untyped);
+       ((WorkerData*)worker_data_untyped)->los_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
@@ -1521,46 +1557,46 @@ workers_finish_callback (void)
 {
        ParallelScanJob *psj;
        ScanJob *sj;
-       int split_count = sgen_workers_get_job_split_count ();
+       int split_count = sgen_workers_get_job_split_count (GENERATION_OLD);
        int i;
        /* Mod union preclean jobs */
        for (i = 0; i < split_count; i++) {
                psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("preclean major mod union cardtable", job_major_mod_union_preclean, sizeof (ParallelScanJob));
                psj->scan_job.gc_thread_gray_queue = NULL;
                psj->job_index = i;
-               sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+               psj->job_split_count = split_count;
+               sgen_workers_enqueue_job (GENERATION_OLD, &psj->scan_job.job, TRUE);
        }
 
        for (i = 0; i < split_count; i++) {
                psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("preclean los mod union cardtable", job_los_mod_union_preclean, sizeof (ParallelScanJob));
                psj->scan_job.gc_thread_gray_queue = NULL;
                psj->job_index = i;
-               sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+               psj->job_split_count = split_count;
+               sgen_workers_enqueue_job (GENERATION_OLD, &psj->scan_job.job, TRUE);
        }
 
        sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan last pinned", job_scan_last_pinned, sizeof (ScanJob));
        sj->gc_thread_gray_queue = NULL;
-       sgen_workers_enqueue_job (&sj->job, TRUE);
+       sgen_workers_enqueue_job (GENERATION_OLD, &sj->job, TRUE);
 }
 
 static void
-init_gray_queue (SgenGrayQueue *gc_thread_gray_queue, gboolean use_workers)
+init_gray_queue (SgenGrayQueue *gc_thread_gray_queue)
 {
-       if (use_workers)
-               sgen_workers_init_distribute_gray_queue ();
        sgen_gray_object_queue_init (gc_thread_gray_queue, NULL, TRUE);
 }
 
 static void
 enqueue_scan_remembered_set_jobs (SgenGrayQueue *gc_thread_gray_queue, SgenObjectOperations *ops, gboolean enqueue)
 {
-       int i, split_count = sgen_workers_get_job_split_count ();
+       int i, split_count = sgen_workers_get_job_split_count (GENERATION_NURSERY);
        ScanJob *sj;
 
        sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan wbroots", job_scan_wbroots, sizeof (ScanJob));
        sj->ops = ops;
        sj->gc_thread_gray_queue = gc_thread_gray_queue;
-       sgen_workers_enqueue_job (&sj->job, enqueue);
+       sgen_workers_enqueue_job (GENERATION_NURSERY, &sj->job, enqueue);
 
        for (i = 0; i < split_count; i++) {
                ParallelScanJob *psj;
@@ -1569,13 +1605,15 @@ enqueue_scan_remembered_set_jobs (SgenGrayQueue *gc_thread_gray_queue, SgenObjec
                psj->scan_job.ops = ops;
                psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
                psj->job_index = i;
-               sgen_workers_enqueue_job (&psj->scan_job.job, enqueue);
+               psj->job_split_count = split_count;
+               sgen_workers_enqueue_job (GENERATION_NURSERY, &psj->scan_job.job, enqueue);
 
                psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan LOS remsets", job_scan_los_card_table, sizeof (ParallelScanJob));
                psj->scan_job.ops = ops;
                psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
                psj->job_index = i;
-               sgen_workers_enqueue_job (&psj->scan_job.job, enqueue);
+               psj->job_split_count = split_count;
+               sgen_workers_enqueue_job (GENERATION_NURSERY, &psj->scan_job.job, enqueue);
        }
 }
 
@@ -1594,7 +1632,7 @@ enqueue_scan_from_roots_jobs (SgenGrayQueue *gc_thread_gray_queue, char *heap_st
        scrrj->heap_start = heap_start;
        scrrj->heap_end = heap_end;
        scrrj->root_type = ROOT_TYPE_NORMAL;
-       sgen_workers_enqueue_job (&scrrj->scan_job.job, enqueue);
+       sgen_workers_enqueue_job (current_collection_generation, &scrrj->scan_job.job, enqueue);
 
        if (current_collection_generation == GENERATION_OLD) {
                /* During minors we scan the cardtable for these roots instead */
@@ -1604,7 +1642,7 @@ enqueue_scan_from_roots_jobs (SgenGrayQueue *gc_thread_gray_queue, char *heap_st
                scrrj->heap_start = heap_start;
                scrrj->heap_end = heap_end;
                scrrj->root_type = ROOT_TYPE_WBARRIER;
-               sgen_workers_enqueue_job (&scrrj->scan_job.job, enqueue);
+               sgen_workers_enqueue_job (current_collection_generation, &scrrj->scan_job.job, enqueue);
        }
 
        /* Threads */
@@ -1614,7 +1652,7 @@ enqueue_scan_from_roots_jobs (SgenGrayQueue *gc_thread_gray_queue, char *heap_st
        stdj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
        stdj->heap_start = heap_start;
        stdj->heap_end = heap_end;
-       sgen_workers_enqueue_job (&stdj->scan_job.job, enqueue);
+       sgen_workers_enqueue_job (current_collection_generation, &stdj->scan_job.job, enqueue);
 
        /* Scan the list of objects ready for finalization. */
 
@@ -1622,13 +1660,13 @@ enqueue_scan_from_roots_jobs (SgenGrayQueue *gc_thread_gray_queue, char *heap_st
        sfej->scan_job.ops = ops;
        sfej->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
        sfej->queue = &fin_ready_queue;
-       sgen_workers_enqueue_job (&sfej->scan_job.job, enqueue);
+       sgen_workers_enqueue_job (current_collection_generation, &sfej->scan_job.job, enqueue);
 
        sfej = (ScanFinalizerEntriesJob*)sgen_thread_pool_job_alloc ("scan critical finalizer entries", job_scan_finalizer_entries, sizeof (ScanFinalizerEntriesJob));
        sfej->scan_job.ops = ops;
        sfej->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
        sfej->queue = &critical_fin_queue;
-       sgen_workers_enqueue_job (&sfej->scan_job.job, enqueue);
+       sgen_workers_enqueue_job (current_collection_generation, &sfej->scan_job.job, enqueue);
 }
 
 /*
@@ -1648,6 +1686,9 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
        TV_DECLARE (btv);
        SGEN_TV_DECLARE (last_minor_collection_start_tv);
        SGEN_TV_DECLARE (last_minor_collection_end_tv);
+       guint64 major_scan_start = time_minor_scan_major_blocks;
+       guint64 los_scan_start = time_minor_scan_los;
+       guint64 finish_gray_start = time_minor_finish_gray_stack;
 
        if (disable_minor_collections)
                return TRUE;
@@ -1657,15 +1698,14 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
 
        binary_protocol_collection_begin (gc_stats.minor_gc_count, GENERATION_NURSERY);
 
-       if (sgen_concurrent_collection_in_progress ()) {
-               /* FIXME Support parallel nursery collections with concurrent major */
-               object_ops_nopar = &sgen_minor_collector.serial_ops_with_concurrent_major;
-       } else {
-               object_ops_nopar = &sgen_minor_collector.serial_ops;
-               if (sgen_minor_collector.is_parallel && sgen_nursery_size >= SGEN_PARALLEL_MINOR_MIN_NURSERY_SIZE) {
-                       object_ops_par = &sgen_minor_collector.parallel_ops;
-                       is_parallel = TRUE;
-               }
+       object_ops_nopar = sgen_concurrent_collection_in_progress ()
+                               ? &sgen_minor_collector.serial_ops_with_concurrent_major
+                               : &sgen_minor_collector.serial_ops;
+       if (sgen_minor_collector.is_parallel && sgen_nursery_size >= SGEN_PARALLEL_MINOR_MIN_NURSERY_SIZE) {
+               object_ops_par = sgen_concurrent_collection_in_progress ()
+                                       ? &sgen_minor_collector.parallel_ops_with_concurrent_major
+                                       : &sgen_minor_collector.parallel_ops;
+               is_parallel = TRUE;
        }
 
        if (do_verify_nursery || do_dump_nursery_content)
@@ -1696,7 +1736,7 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
 
        sgen_memgov_minor_collection_start ();
 
-       init_gray_queue (&gc_thread_gray_queue, is_parallel);
+       init_gray_queue (&gc_thread_gray_queue);
        ctx = CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, &gc_thread_gray_queue);
 
        gc_stats.minor_gc_count ++;
@@ -1705,6 +1745,8 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
 
        /* pin from pinned handles */
        sgen_init_pinning ();
+       if (concurrent_collection_in_progress)
+               sgen_init_pinning_for_conc ();
        sgen_client_binary_protocol_mark_start (GENERATION_NURSERY);
        pin_from_roots (nursery_section->data, nursery_section->end_data, ctx);
        /* pin cemented objects */
@@ -1715,6 +1757,8 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
 
        pin_objects_in_nursery (FALSE, ctx);
        sgen_pinning_trim_queue_to_section (nursery_section);
+       if (concurrent_collection_in_progress)
+               sgen_finish_pinning_for_conc ();
 
        if (remset_consistency_checks)
                sgen_check_remset_consistency ();
@@ -1750,8 +1794,8 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
 
        if (is_parallel) {
                gray_queue_redirect (&gc_thread_gray_queue);
-               sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
-               sgen_workers_join ();
+               sgen_workers_start_all_workers (GENERATION_NURSERY, object_ops_nopar, object_ops_par, NULL);
+               sgen_workers_join (GENERATION_NURSERY);
        }
 
        TV_GETTIME (btv);
@@ -1839,6 +1883,14 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
        current_collection_generation = -1;
        objects_pinned = 0;
 
+       if (is_parallel)
+               binary_protocol_collection_end_stats (0, 0, time_minor_finish_gray_stack - finish_gray_start);
+       else
+               binary_protocol_collection_end_stats (
+                       time_minor_scan_major_blocks - major_scan_start,
+                       time_minor_scan_los - los_scan_start,
+                       time_minor_finish_gray_stack - finish_gray_start);
+
        binary_protocol_collection_end (gc_stats.minor_gc_count - 1, GENERATION_NURSERY, 0, 0);
 
        if (check_nursery_objects_pinned && !sgen_minor_collector.is_split)
@@ -1904,6 +1956,8 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 
        TV_GETTIME (atv);
        sgen_init_pinning ();
+       if (mode == COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT)
+               sgen_init_pinning_for_conc ();
        SGEN_LOG (6, "Collecting pinned addresses");
        pin_from_roots ((void*)lowest_heap_address, (void*)highest_heap_address, ctx);
        if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
@@ -1974,20 +2028,23 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
        SGEN_LOG (2, "Finding pinned pointers: %zd in %lld usecs", sgen_get_pinned_count (), (long long)TV_ELAPSED (atv, btv));
        SGEN_LOG (4, "Start scan with %zd pinned objects", sgen_get_pinned_count ());
 
+       if (mode == COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT)
+               sgen_finish_pinning_for_conc ();
+
        major_collector.init_to_space ();
 
        SGEN_ASSERT (0, sgen_workers_all_done (), "Why are the workers not done when we start or finish a major collection?");
        if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
                if (object_ops_par != NULL)
-                       sgen_workers_set_num_active_workers (0);
-               if (sgen_workers_have_idle_work ()) {
+                       sgen_workers_set_num_active_workers (GENERATION_OLD, 0);
+               if (sgen_workers_have_idle_work (GENERATION_OLD)) {
                        /*
                         * We force the finish of the worker with the new object ops context
                         * which can also do copying. We need to have finished pinning.
                         */
-                       sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
+                       sgen_workers_start_all_workers (GENERATION_OLD, object_ops_nopar, object_ops_par, NULL);
 
-                       sgen_workers_join ();
+                       sgen_workers_join (GENERATION_OLD);
                }
        }
 
@@ -2013,17 +2070,17 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
         * the roots.
         */
        if (mode == COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT) {
-               sgen_workers_set_num_active_workers (1);
+               sgen_workers_set_num_active_workers (GENERATION_OLD, 1);
                gray_queue_redirect (gc_thread_gray_queue);
                if (precleaning_enabled) {
-                       sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, workers_finish_callback);
+                       sgen_workers_start_all_workers (GENERATION_OLD, object_ops_nopar, object_ops_par, workers_finish_callback);
                } else {
-                       sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
+                       sgen_workers_start_all_workers (GENERATION_OLD, object_ops_nopar, object_ops_par, NULL);
                }
        }
 
        if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
-               int i, split_count = sgen_workers_get_job_split_count ();
+               int i, split_count = sgen_workers_get_job_split_count (GENERATION_OLD);
                gboolean parallel = object_ops_par != NULL;
 
                /* If we're not parallel we finish the collection on the gc thread */
@@ -2038,13 +2095,15 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
                        psj->scan_job.ops = parallel ? NULL : object_ops_nopar;
                        psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
                        psj->job_index = i;
-                       sgen_workers_enqueue_job (&psj->scan_job.job, parallel);
+                       psj->job_split_count = split_count;
+                       sgen_workers_enqueue_job (GENERATION_OLD, &psj->scan_job.job, parallel);
 
                        psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan LOS mod union cardtable", job_scan_los_mod_union_card_table, sizeof (ParallelScanJob));
                        psj->scan_job.ops = parallel ? NULL : object_ops_nopar;
                        psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
                        psj->job_index = i;
-                       sgen_workers_enqueue_job (&psj->scan_job.job, parallel);
+                       psj->job_split_count = split_count;
+                       sgen_workers_enqueue_job (GENERATION_OLD, &psj->scan_job.job, parallel);
                }
 
                if (parallel) {
@@ -2056,8 +2115,8 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
                         * stack that contained roots and pinned objects and also scan the mod union card
                         * table.
                         */
-                       sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
-                       sgen_workers_join ();
+                       sgen_workers_start_all_workers (GENERATION_OLD, object_ops_nopar, object_ops_par, NULL);
+                       sgen_workers_join (GENERATION_OLD);
                }
        }
 
@@ -2082,7 +2141,7 @@ major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason,
 
        current_collection_generation = GENERATION_OLD;
 
-       sgen_workers_assert_gray_queue_is_empty ();
+       sgen_workers_assert_gray_queue_is_empty (GENERATION_OLD);
 
        if (!concurrent)
                sgen_cement_reset ();
@@ -2126,8 +2185,9 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
        mword fragment_total;
        TV_DECLARE (atv);
        TV_DECLARE (btv);
-
-       TV_GETTIME (btv);
+       guint64 major_scan_start = time_major_scan_mod_union_blocks;
+       guint64 los_scan_start = time_major_scan_mod_union_los;
+       guint64 finish_gray_start = time_major_finish_gray_stack;
 
        if (concurrent_collection_in_progress) {
                SgenObjectOperations *object_ops_par = NULL;
@@ -2145,8 +2205,9 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
                object_ops_nopar = &major_collector.major_ops_serial;
        }
 
-       sgen_workers_assert_gray_queue_is_empty ();
+       sgen_workers_assert_gray_queue_is_empty (GENERATION_OLD);
 
+       TV_GETTIME (btv);
        finish_gray_stack (GENERATION_OLD, CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, gc_thread_gray_queue));
        TV_GETTIME (atv);
        time_major_finish_gray_stack += TV_ELAPSED (btv, atv);
@@ -2237,7 +2298,7 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
        memset (&counts, 0, sizeof (ScannedObjectCounts));
        major_collector.finish_major_collection (&counts);
 
-       sgen_workers_assert_gray_queue_is_empty ();
+       sgen_workers_assert_gray_queue_is_empty (GENERATION_OLD);
 
        SGEN_ASSERT (0, sgen_workers_all_done (), "Can't have workers working after major collection has finished");
        if (concurrent_collection_in_progress)
@@ -2248,6 +2309,13 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
        binary_protocol_flush_buffers (FALSE);
 
        //consistency_check ();
+       if (major_collector.is_parallel)
+                binary_protocol_collection_end_stats (0, 0, time_major_finish_gray_stack - finish_gray_start);
+        else
+                binary_protocol_collection_end_stats (
+                        time_major_scan_mod_union_blocks - major_scan_start,
+                        time_major_scan_mod_union_los - los_scan_start,
+                        time_major_finish_gray_stack - finish_gray_start);
 
        binary_protocol_collection_end (gc_stats.major_gc_count - 1, GENERATION_OLD, counts.num_scanned_objects, counts.num_unique_scanned_objects);
 }
@@ -2271,7 +2339,7 @@ major_do_collection (const char *reason, gboolean is_overflow, gboolean forced)
        /* world must be stopped already */
        TV_GETTIME (time_start);
 
-       init_gray_queue (&gc_thread_gray_queue, FALSE);
+       init_gray_queue (&gc_thread_gray_queue);
        major_start_collection (&gc_thread_gray_queue, reason, FALSE, &old_next_pin_slot);
        major_finish_collection (&gc_thread_gray_queue, reason, is_overflow, old_next_pin_slot, forced);
        sgen_gray_object_queue_dispose (&gc_thread_gray_queue);
@@ -2305,7 +2373,7 @@ major_start_concurrent_collection (const char *reason)
 
        binary_protocol_concurrent_start ();
 
-       init_gray_queue (&gc_thread_gray_queue, TRUE);
+       init_gray_queue (&gc_thread_gray_queue);
        // FIXME: store reason and pass it when finishing
        major_start_collection (&gc_thread_gray_queue, reason, TRUE, NULL);
        sgen_gray_object_queue_dispose (&gc_thread_gray_queue);
@@ -2360,7 +2428,7 @@ major_finish_concurrent_collection (gboolean forced)
         * The workers will be resumed with a finishing pause context to avoid
         * additional cardtable and object scanning.
         */
-       sgen_workers_stop_all_workers ();
+       sgen_workers_stop_all_workers (GENERATION_OLD);
 
        SGEN_TV_GETTIME (time_major_conc_collection_end);
        gc_stats.major_gc_time_concurrent += SGEN_TV_ELAPSED (time_major_conc_collection_start, time_major_conc_collection_end);
@@ -2373,7 +2441,7 @@ major_finish_concurrent_collection (gboolean forced)
 
        current_collection_generation = GENERATION_OLD;
        sgen_cement_reset ();
-       init_gray_queue (&gc_thread_gray_queue, FALSE);
+       init_gray_queue (&gc_thread_gray_queue);
        major_finish_collection (&gc_thread_gray_queue, "finishing", FALSE, -1, forced);
        sgen_gray_object_queue_dispose (&gc_thread_gray_queue);
 
@@ -2831,19 +2899,19 @@ sgen_get_current_collection_generation (void)
 }
 
 void*
-sgen_thread_register (SgenThreadInfo* info, void *stack_bottom_fallback)
+sgen_thread_attach (SgenThreadInfo* info)
 {
        info->tlab_start = info->tlab_next = info->tlab_temp_end = info->tlab_real_end = NULL;
 
-       sgen_client_thread_register (info, stack_bottom_fallback);
+       sgen_client_thread_attach (info);
 
        return info;
 }
 
 void
-sgen_thread_unregister (SgenThreadInfo *p)
+sgen_thread_detach_with_lock (SgenThreadInfo *p)
 {
-       sgen_client_thread_unregister (p);
+       sgen_client_thread_detach_with_lock (p);
 }
 
 /*
@@ -3159,9 +3227,8 @@ init_sgen_mode (SgenMode mode)
                /*
                 * Use concurrent major and dynamic nursery with a more
                 * aggressive shrinking relative to pause times.
-                * FIXME use parallel minors
                 */
-               minor = SGEN_MINOR_SIMPLE;
+               minor = SGEN_MINOR_SIMPLE_PARALLEL;
                major = SGEN_MAJOR_CONCURRENT;
                dynamic_nursery = TRUE;
                sgen_max_pause_margin = SGEN_PAUSE_MODE_MAX_PAUSE_MARGIN;
@@ -3594,19 +3661,7 @@ sgen_gc_init (void)
        if (major_collector.post_param_init)
                major_collector.post_param_init (&major_collector);
 
-       if (major_collector.is_concurrent || sgen_minor_collector.is_parallel) {
-               int num_workers = 1;
-               if (major_collector.is_parallel || sgen_minor_collector.is_parallel) {
-                       num_workers = mono_cpu_count ();
-                       if (num_workers <= 1) {
-                               num_workers = 1;
-                               major_collector.is_parallel = FALSE;
-                               sgen_minor_collector.is_parallel = FALSE;
-                       }
-               }
-               if (major_collector.is_concurrent || sgen_minor_collector.is_parallel)
-                       sgen_workers_init (num_workers, (SgenWorkerCallback) major_collector.worker_init_cb);
-       }
+       sgen_thread_pool_start ();
 
        sgen_memgov_init (max_heap, soft_limit, debug_print_allowance, allowance_ratio, save_target);