From 956ebf32abde1dcd62b06e02dc2d05dd6719a36e Mon Sep 17 00:00:00 2001
From: Vlad Brezae
Date: Thu, 29 Sep 2016 02:16:06 +0300
Subject: [PATCH] [sgen] Switch to nopar context when we are left with one
 worker

The non-parallel context is significantly faster since it doesn't require
CAS-ing. Switch to it when we are left with only one worker so we don't
regress performance.
---
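Note: the sketch below is illustrative only and is not SGen's gray-queue code;
the queue types and function names (ParQueue, push_par, push_nopar) are
invented. It shows the cost difference the message above refers to: with
several workers pushing into a shared queue, every push has to reserve its
slot with an atomic compare-and-swap, while a single worker can get by with
plain loads and stores.

    #include <stdatomic.h>

    #define QUEUE_SIZE 1024

    /* Queue as used by a parallel context: several workers may push
     * concurrently, so the cursor must be advanced with a CAS. */
    typedef struct {
            void *slots [QUEUE_SIZE];
            _Atomic int top;
    } ParQueue;

    static void
    push_par (ParQueue *q, void *obj)
    {
            /* CAS loop: on failure `old` is refreshed with the current value. */
            int old = atomic_load (&q->top);
            while (!atomic_compare_exchange_weak (&q->top, &old, old + 1))
                    ;
            q->slots [old] = obj;
    }

    /* Queue as used by the non-parallel context: with a single worker a
     * plain store suffices, which is what makes this context faster. */
    typedef struct {
            void *slots [QUEUE_SIZE];
            int top;
    } NoparQueue;

    static void
    push_nopar (NoparQueue *q, void *obj)
    {
            q->slots [q->top++] = obj;
    }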
 mono/sgen/sgen-gc.c      | 46 +++++++++++++++++++---------------------
 mono/sgen/sgen-workers.c | 25 +++++++++++++++++++---
 mono/sgen/sgen-workers.h |  2 +-
 3 files changed, 45 insertions(+), 28 deletions(-)

diff --git a/mono/sgen/sgen-gc.c b/mono/sgen/sgen-gc.c
index c689cd237b4..fa92d0c58b7 100644
--- a/mono/sgen/sgen-gc.c
+++ b/mono/sgen/sgen-gc.c
@@ -1737,7 +1737,7 @@ typedef enum {
 } CopyOrMarkFromRootsMode;
 
 static void
-major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_next_pin_slot, CopyOrMarkFromRootsMode mode, SgenObjectOperations *object_ops, SgenObjectOperations *worker_object_ops)
+major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_next_pin_slot, CopyOrMarkFromRootsMode mode, SgenObjectOperations *object_ops_nopar, SgenObjectOperations *object_ops_par)
 {
 	LOSObject *bigobj;
 	TV_DECLARE (atv);
@@ -1747,7 +1747,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 	 */
 	char *heap_start = NULL;
 	char *heap_end = (char*)-1;
-	ScanCopyContext ctx = CONTEXT_FROM_OBJECT_OPERATIONS (object_ops, gc_thread_gray_queue);
+	ScanCopyContext ctx = CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, gc_thread_gray_queue);
 	gboolean concurrent = mode != COPY_OR_MARK_FROM_ROOTS_SERIAL;
 
 	SGEN_ASSERT (0, !!concurrent == !!concurrent_collection_in_progress, "We've been called with the wrong mode.");
@@ -1872,7 +1872,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 		 * We force the finish of the worker with the new object ops context
 		 * which can also do copying. We need to have finished pinning.
 		 */
-		sgen_workers_start_all_workers (worker_object_ops, NULL);
+		sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
 		sgen_workers_join ();
 	}
 
@@ -1889,7 +1889,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 
 	sgen_client_collecting_major_3 (&fin_ready_queue, &critical_fin_queue);
 
-	enqueue_scan_from_roots_jobs (gc_thread_gray_queue, heap_start, heap_end, object_ops, FALSE);
+	enqueue_scan_from_roots_jobs (gc_thread_gray_queue, heap_start, heap_end, object_ops_nopar, FALSE);
 
 	TV_GETTIME (btv);
 	time_major_scan_roots += TV_ELAPSED (atv, btv);
@@ -1902,9 +1902,9 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 	if (mode == COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT) {
 		gray_queue_redirect (gc_thread_gray_queue);
 		if (precleaning_enabled) {
-			sgen_workers_start_all_workers (worker_object_ops, workers_finish_callback);
+			sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, workers_finish_callback);
 		} else {
-			sgen_workers_start_all_workers (worker_object_ops, NULL);
+			sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
 		}
 	}
 
@@ -1918,13 +1918,13 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 			ParallelScanJob *psj;
 
 			psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan mod union cardtable", job_scan_major_mod_union_card_table, sizeof (ParallelScanJob));
-			psj->scan_job.ops = worker_object_ops;
+			psj->scan_job.ops = object_ops_par ? object_ops_par : object_ops_nopar;
 			psj->scan_job.gc_thread_gray_queue = NULL;
 			psj->job_index = i;
 			sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
 
 			psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan LOS mod union cardtable", job_scan_los_mod_union_card_table, sizeof (ParallelScanJob));
-			psj->scan_job.ops = worker_object_ops;
+			psj->scan_job.ops = object_ops_par ? object_ops_par : object_ops_nopar;
 			psj->scan_job.gc_thread_gray_queue = NULL;
 			psj->job_index = i;
 			sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
@@ -1938,7 +1938,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 		 * stack that contained roots and pinned objects and also scan the mod union card
 		 * table.
 		 */
-		sgen_workers_start_all_workers (worker_object_ops, NULL);
+		sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
 		sgen_workers_join ();
 	}
 
@@ -1957,7 +1957,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 static void
 major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason, gboolean concurrent, size_t *old_next_pin_slot)
 {
-	SgenObjectOperations *object_ops;
+	SgenObjectOperations *object_ops_nopar, *object_ops_par = NULL;
 
 	binary_protocol_collection_begin (gc_stats.major_gc_count, GENERATION_OLD);
 
@@ -1972,13 +1972,12 @@ major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason,
 		g_assert (major_collector.is_concurrent);
 		concurrent_collection_in_progress = TRUE;
 
+		object_ops_nopar = &major_collector.major_ops_concurrent_start;
 		if (major_collector.is_parallel)
-			object_ops = &major_collector.major_ops_conc_par_start;
-		else
-			object_ops = &major_collector.major_ops_concurrent_start;
+			object_ops_par = &major_collector.major_ops_conc_par_start;
 
 	} else {
-		object_ops = &major_collector.major_ops_serial;
+		object_ops_nopar = &major_collector.major_ops_serial;
 	}
 
 	reset_pinned_from_failed_allocation ();
@@ -1997,14 +1996,14 @@ major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason,
 	if (major_collector.start_major_collection)
 		major_collector.start_major_collection ();
 
-	major_copy_or_mark_from_roots (gc_thread_gray_queue, old_next_pin_slot, concurrent ? COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT : COPY_OR_MARK_FROM_ROOTS_SERIAL, object_ops, object_ops);
+	major_copy_or_mark_from_roots (gc_thread_gray_queue, old_next_pin_slot, concurrent ? COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT : COPY_OR_MARK_FROM_ROOTS_SERIAL, object_ops_nopar, object_ops_par);
 }
 
 static void
 major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason, gboolean is_overflow, size_t old_next_pin_slot, gboolean forced)
 {
 	ScannedObjectCounts counts;
-	SgenObjectOperations *object_ops;
+	SgenObjectOperations *object_ops_nopar;
 	mword fragment_total;
 	TV_DECLARE (atv);
 	TV_DECLARE (btv);
@@ -2012,25 +2011,24 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
 	TV_GETTIME (btv);
 
 	if (concurrent_collection_in_progress) {
-		SgenObjectOperations *worker_object_ops;
-		object_ops = &major_collector.major_ops_concurrent_finish;
+		SgenObjectOperations *object_ops_par = NULL;
+
+		object_ops_nopar = &major_collector.major_ops_concurrent_finish;
 		if (major_collector.is_parallel)
-			worker_object_ops = &major_collector.major_ops_conc_par_finish;
-		else
-			worker_object_ops = object_ops;
+			object_ops_par = &major_collector.major_ops_conc_par_finish;
 
-		major_copy_or_mark_from_roots (gc_thread_gray_queue, NULL, COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT, object_ops, worker_object_ops);
+		major_copy_or_mark_from_roots (gc_thread_gray_queue, NULL, COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT, object_ops_nopar, object_ops_par);
 
 #ifdef SGEN_DEBUG_INTERNAL_ALLOC
 	main_gc_thread = NULL;
 #endif
 	} else {
-		object_ops = &major_collector.major_ops_serial;
+		object_ops_nopar = &major_collector.major_ops_serial;
 	}
 
 	sgen_workers_assert_gray_queue_is_empty ();
 
-	finish_gray_stack (GENERATION_OLD, CONTEXT_FROM_OBJECT_OPERATIONS (object_ops, gc_thread_gray_queue));
+	finish_gray_stack (GENERATION_OLD, CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, gc_thread_gray_queue));
 	TV_GETTIME (atv);
 	time_major_finish_gray_stack += TV_ELAPSED (btv, atv);
 
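A note on the job setup above: the mod union scan jobs pick their operations
with object_ops_par ? object_ops_par : object_ops_nopar, so a non-parallel
collector, which passes object_ops_par == NULL, keeps its old behavior. A
minimal sketch of that selection follows, with OpsSketch and pick_worker_ops
as hypothetical stand-ins (not SGen code):

    /* Hypothetical stand-in for SgenObjectOperations, which bundles the
     * copy/scan function pointers used during a collection phase. */
    typedef struct OpsSketch OpsSketch;

    /* Prefer the parallel ops when the collector provides them and fall
     * back to the non-parallel ops otherwise. */
    static OpsSketch *
    pick_worker_ops (OpsSketch *nopar, OpsSketch *par)
    {
            return par ? par : nopar;
    }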
diff --git a/mono/sgen/sgen-workers.c b/mono/sgen/sgen-workers.c
index 313ea91c7ef..adcbaf3d553 100644
--- a/mono/sgen/sgen-workers.c
+++ b/mono/sgen/sgen-workers.c
@@ -60,6 +60,7 @@ enum {
 typedef gint32 State;
 
 static SgenObjectOperations * volatile idle_func_object_ops;
+static SgenObjectOperations *idle_func_object_ops_par, *idle_func_object_ops_nopar;
 /*
  * finished_callback is called only when the workers finish work normally (when they
  * are not forced to finish). The callback is used to enqueue preclean jobs.
@@ -94,6 +95,14 @@ sgen_workers_ensure_awake (void)
 	int i;
 	gboolean need_signal = FALSE;
 
+	/*
+	 * All workers are awakened, so make sure we reset the parallel context.
+	 * We call this function only when starting the workers (so nobody is running)
+	 * or when the last worker is enqueuing preclean work. In both cases no worker
+	 * can be working with the nopar context, which makes the reset safe.
+	 */
+	idle_func_object_ops = (workers_num > 1) ? idle_func_object_ops_par : idle_func_object_ops_nopar;
+
 	for (i = 0; i < workers_num; i++) {
 		State old_state;
 		gboolean did_set_state;
@@ -132,6 +141,7 @@ worker_try_finish (WorkerData *data)
 
 	if (working == 1) {
 		SgenWorkersFinishCallback callback = finish_callback;
+		SGEN_ASSERT (0, idle_func_object_ops == idle_func_object_ops_nopar, "Why are we finishing with parallel context");
 		/* We are the last one left. Enqueue a preclean job if we have one and wake everybody up. */
 		SGEN_ASSERT (0, data->state != STATE_NOT_WORKING, "How did we get from doing idle work to NOT WORKING without setting it ourselves?");
 		if (callback) {
@@ -153,6 +163,14 @@ worker_try_finish (WorkerData *data)
 		SGEN_ASSERT (0, old_state == STATE_WORKING, "What other possibility is there?");
 	} while (!set_state (data, old_state, STATE_NOT_WORKING));
 
+	/*
+	 * If we are the second to last to finish, we set the scan context to the
+	 * non-parallel version so we can speed up the last worker. This helps us maintain
+	 * the same level of performance as non-parallel mode even if we fail to distribute work properly.
+	 */
+	if (working == 2)
+		idle_func_object_ops = idle_func_object_ops_nopar;
+
 	mono_os_mutex_unlock (&finished_lock);
 
 	binary_protocol_worker_finish (sgen_timestamp (), forced_stop);
@@ -333,10 +351,11 @@ sgen_workers_stop_all_workers (void)
 }
 
 void
-sgen_workers_start_all_workers (SgenObjectOperations *object_ops, SgenWorkersFinishCallback callback)
+sgen_workers_start_all_workers (SgenObjectOperations *object_ops_nopar, SgenObjectOperations *object_ops_par, SgenWorkersFinishCallback callback)
 {
+	idle_func_object_ops_par = object_ops_par;
+	idle_func_object_ops_nopar = object_ops_nopar;
 	forced_stop = FALSE;
-	idle_func_object_ops = object_ops;
 	finish_callback = callback;
 	mono_memory_write_barrier ();
 
@@ -422,7 +441,7 @@ sgen_workers_take_from_queue (SgenGrayQueue *queue)
 SgenObjectOperations*
 sgen_workers_get_idle_func_object_ops (void)
 {
-	return idle_func_object_ops;
+	return (idle_func_object_ops_par) ? idle_func_object_ops_par : idle_func_object_ops_nopar;
 }
 
 /*
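The core of the change is in worker_try_finish above: the second-to-last
worker to finish flips the shared scan context to the non-parallel one while
holding finished_lock, so the last worker stops paying for CAS-es it no
longer needs. A minimal sketch of that pattern, assuming a plain pthreads
mutex and invented names (worker_finish, idle_ops and friends mirror the
code above but this is not SGen code):

    #include <pthread.h>

    static pthread_mutex_t finished_lock = PTHREAD_MUTEX_INITIALIZER;
    static int working;                     /* workers still doing idle work */
    static void *idle_ops, *idle_ops_par, *idle_ops_nopar;

    static void
    worker_finish (void)
    {
            pthread_mutex_lock (&finished_lock);
            if (working == 2) {
                    /* Once we stop, a single worker remains; it cannot race
                     * with anybody, so hand it the cheaper non-parallel
                     * context. */
                    idle_ops = idle_ops_nopar;
            }
            working--;
            pthread_mutex_unlock (&finished_lock);
    }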
diff --git a/mono/sgen/sgen-workers.h b/mono/sgen/sgen-workers.h
index 21da4207024..9ef4236a4de 100644
--- a/mono/sgen/sgen-workers.h
+++ b/mono/sgen/sgen-workers.h
@@ -30,7 +30,7 @@ typedef void (*SgenWorkerCallback) (WorkerData *data);
 
 void sgen_workers_init (int num_workers, SgenWorkerCallback callback);
 void sgen_workers_stop_all_workers (void);
-void sgen_workers_start_all_workers (SgenObjectOperations *object_ops, SgenWorkersFinishCallback finish_job);
+void sgen_workers_start_all_workers (SgenObjectOperations *object_ops_nopar, SgenObjectOperations *object_ops_par, SgenWorkersFinishCallback finish_job);
 void sgen_workers_init_distribute_gray_queue (void);
 void sgen_workers_enqueue_job (SgenThreadPoolJob *job, gboolean enqueue);
 void sgen_workers_distribute_gray_queue_sections (void);
--
2.25.1