[sgen] Parallel mod union scan jobs
[mono.git] / mono / sgen / sgen-gc.c
index 7b650e42caed2aca222b783dea5788135e495edf..c689cd237b4ce9b4b35451f85f701988346f86f2 100644 (file)
@@ -290,6 +290,8 @@ static SGEN_TV_DECLARE (time_major_conc_collection_end);
 
 int gc_debug_level = 0;
 FILE* gc_debug_file;
+static char* gc_params_options;
+static char* gc_debug_options;
 
 /*
 void
@@ -422,12 +424,11 @@ sgen_workers_get_job_gray_queue (WorkerData *worker_data, SgenGrayQueue *default
 }
 
 static void
-gray_queue_enable_redirect (SgenGrayQueue *queue)
+gray_queue_redirect (SgenGrayQueue *queue)
 {
        SGEN_ASSERT (0, concurrent_collection_in_progress, "Where are we redirecting the gray queue to, without a concurrent collection?");
 
-       sgen_gray_queue_set_alloc_prepare (queue, sgen_workers_take_from_queue_and_awake);
-       sgen_workers_take_from_queue_and_awake (queue);
+       sgen_workers_take_from_queue (queue);
 }
 
 void
@@ -990,6 +991,24 @@ mono_gc_get_logfile (void)
        return gc_debug_file;
 }
 
+void
+mono_gc_params_set (const char* options)
+{
+       if (gc_params_options)
+               g_free (gc_params_options);
+
+       gc_params_options = g_strdup (options);
+}
+
+void
+mono_gc_debug_set (const char* options)
+{
+       if (gc_debug_options)
+               g_free (gc_debug_options);
+
+       gc_debug_options = g_strdup (options);
+}
+
 static void
 scan_finalizer_entries (SgenPointerQueue *fin_queue, ScanCopyContext ctx)
 {
@@ -1169,7 +1188,6 @@ finish_gray_stack (int generation, ScanCopyContext ctx)
 
        g_assert (sgen_gray_object_queue_is_empty (queue));
 
-       sgen_gray_object_queue_trim_free_list (queue);
        binary_protocol_finish_gray_stack_end (sgen_timestamp (), generation);
 }
 
@@ -1305,6 +1323,11 @@ typedef struct {
        SgenGrayQueue *gc_thread_gray_queue;
 } ScanJob;
 
+typedef struct {
+       ScanJob scan_job;
+       int job_index;
+} ParallelScanJob;
+
 static ScanCopyContext
 scan_copy_context_for_scan_job (void *worker_data_untyped, ScanJob *job)
 {
@@ -1367,37 +1390,86 @@ job_scan_finalizer_entries (void *worker_data_untyped, SgenThreadPoolJob *job)
 static void
 job_scan_major_mod_union_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
-       ScanJob *job_data = (ScanJob*)job;
-       ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, job_data);
+       ParallelScanJob *job_data = (ParallelScanJob*)job;
+       ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
-       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx);
+       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
 }
 
 static void
 job_scan_los_mod_union_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
-       ScanJob *job_data = (ScanJob*)job;
-       ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, job_data);
+       ParallelScanJob *job_data = (ParallelScanJob*)job;
+       ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
-       sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx);
+       sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
 }
 
 static void
-job_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
+job_major_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
+{
+       ParallelScanJob *job_data = (ParallelScanJob*)job;
+       ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
+
+       g_assert (concurrent_collection_in_progress);
+
+       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+}
+
+static void
+job_los_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
+{
+       ParallelScanJob *job_data = (ParallelScanJob*)job;
+       ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
+
+       g_assert (concurrent_collection_in_progress);
+
+       sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+}
+
+static void
+job_scan_last_pinned (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
        ScanJob *job_data = (ScanJob*)job;
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, job_data);
 
        g_assert (concurrent_collection_in_progress);
 
-       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx);
-       sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx);
-
        sgen_scan_pin_queue_objects (ctx);
 }
 
+static void
+workers_finish_callback (void)
+{
+       ParallelScanJob *psj;
+       ScanJob *sj;
+       int split_count = sgen_workers_get_job_split_count ();
+       int i;
+       /* Mod union preclean jobs */
+       for (i = 0; i < split_count; i++) {
+               psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("preclean major mod union cardtable", job_major_mod_union_preclean, sizeof (ParallelScanJob));
+               psj->scan_job.ops = sgen_workers_get_idle_func_object_ops ();
+               psj->scan_job.gc_thread_gray_queue = NULL;
+               psj->job_index = i;
+               sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+       }
+
+       for (i = 0; i < split_count; i++) {
+               psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("preclean los mod union cardtable", job_los_mod_union_preclean, sizeof (ParallelScanJob));
+               psj->scan_job.ops = sgen_workers_get_idle_func_object_ops ();
+               psj->scan_job.gc_thread_gray_queue = NULL;
+               psj->job_index = i;
+               sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+       }
+
+       sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan last pinned", job_scan_last_pinned, sizeof (ScanJob));
+       sj->ops = sgen_workers_get_idle_func_object_ops ();
+       sj->gc_thread_gray_queue = NULL;
+       sgen_workers_enqueue_job (&sj->job, TRUE);
+}
+
 static void
 init_gray_queue (SgenGrayQueue *gc_thread_gray_queue, gboolean use_workers)
 {
@@ -1461,7 +1533,7 @@ enqueue_scan_from_roots_jobs (SgenGrayQueue *gc_thread_gray_queue, char *heap_st
  * Return whether any objects were late-pinned due to being out of memory.
  */
 static gboolean
-collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_queue, gboolean finish_up_concurrent_mark)
+collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_queue)
 {
        gboolean needs_major;
        size_t max_garbage_amount;
@@ -1529,11 +1601,6 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
 
        gc_stats.minor_gc_count ++;
 
-       if (whole_heap_check_before_collection) {
-               sgen_clear_nursery_fragments ();
-               sgen_check_whole_heap (finish_up_concurrent_mark);
-       }
-
        sgen_process_fin_stage_entries ();
 
        /* pin from pinned handles */
@@ -1552,6 +1619,11 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
        if (remset_consistency_checks)
                sgen_check_remset_consistency ();
 
+       if (whole_heap_check_before_collection) {
+               sgen_clear_nursery_fragments ();
+               sgen_check_whole_heap (FALSE);
+       }
+
        TV_GETTIME (atv);
        time_minor_pinning += TV_ELAPSED (btv, atv);
        SGEN_LOG (2, "Finding pinned pointers: %zd in %lld usecs", sgen_get_pinned_count (), (long long)TV_ELAPSED (btv, atv));
@@ -1665,7 +1737,7 @@ typedef enum {
 } CopyOrMarkFromRootsMode;
 
 static void
-major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_next_pin_slot, CopyOrMarkFromRootsMode mode, SgenObjectOperations *object_ops)
+major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_next_pin_slot, CopyOrMarkFromRootsMode mode, SgenObjectOperations *object_ops, SgenObjectOperations *worker_object_ops)
 {
        LOSObject *bigobj;
        TV_DECLARE (atv);
@@ -1697,7 +1769,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
        sgen_clear_nursery_fragments ();
 
        if (whole_heap_check_before_collection)
-               sgen_check_whole_heap (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT);
+               sgen_check_whole_heap (TRUE);
 
        TV_GETTIME (btv);
        time_major_pre_collection_fragment_clear += TV_ELAPSED (atv, btv);
@@ -1786,13 +1858,6 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
        if (old_next_pin_slot)
                *old_next_pin_slot = sgen_get_pinned_count ();
 
-       /*
-        * We don't actually pin when starting a concurrent collection, so the remset
-        * consistency check won't work.
-        */
-       if (remset_consistency_checks && mode != COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT)
-               sgen_check_remset_consistency ();
-
        TV_GETTIME (btv);
        time_major_pinning += TV_ELAPSED (atv, btv);
        SGEN_LOG (2, "Finding pinned pointers: %zd in %lld usecs", sgen_get_pinned_count (), (long long)TV_ELAPSED (atv, btv));
@@ -1807,7 +1872,8 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
                         * We force the finish of the worker with the new object ops context
                         * which can also do copying. We need to have finished pinning.
                         */
-                       sgen_workers_start_all_workers (object_ops, NULL);
+                       sgen_workers_start_all_workers (worker_object_ops, NULL);
+
                        sgen_workers_join ();
                }
        }
@@ -1834,35 +1900,46 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
         * the roots.
         */
        if (mode == COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT) {
+               gray_queue_redirect (gc_thread_gray_queue);
                if (precleaning_enabled) {
-                       ScanJob *sj;
-                       /* Mod union preclean job */
-                       sj = (ScanJob*)sgen_thread_pool_job_alloc ("preclean mod union cardtable", job_mod_union_preclean, sizeof (ScanJob));
-                       sj->ops = object_ops;
-                       sj->gc_thread_gray_queue = NULL;
-                       sgen_workers_start_all_workers (object_ops, &sj->job);
+                       sgen_workers_start_all_workers (worker_object_ops, workers_finish_callback);
                } else {
-                       sgen_workers_start_all_workers (object_ops, NULL);
+                       sgen_workers_start_all_workers (worker_object_ops, NULL);
                }
-               gray_queue_enable_redirect (gc_thread_gray_queue);
        }
 
        if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
-               ScanJob *sj;
+               int i, split_count = sgen_workers_get_job_split_count ();
+
+               gray_queue_redirect (gc_thread_gray_queue);
 
                /* Mod union card table */
-               sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan mod union cardtable", job_scan_major_mod_union_card_table, sizeof (ScanJob));
-               sj->ops = object_ops;
-               sj->gc_thread_gray_queue = gc_thread_gray_queue;
-               sgen_workers_enqueue_job (&sj->job, FALSE);
-
-               sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan LOS mod union cardtable", job_scan_los_mod_union_card_table, sizeof (ScanJob));
-               sj->ops = object_ops;
-               sj->gc_thread_gray_queue = gc_thread_gray_queue;
-               sgen_workers_enqueue_job (&sj->job, FALSE);
-
-               TV_GETTIME (atv);
-               time_major_scan_mod_union += TV_ELAPSED (btv, atv);
+               for (i = 0; i < split_count; i++) {
+                       ParallelScanJob *psj;
+
+                       psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan mod union cardtable", job_scan_major_mod_union_card_table, sizeof (ParallelScanJob));
+                       psj->scan_job.ops = worker_object_ops;
+                       psj->scan_job.gc_thread_gray_queue = NULL;
+                       psj->job_index = i;
+                       sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+
+                       psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan LOS mod union cardtable", job_scan_los_mod_union_card_table, sizeof (ParallelScanJob));
+                       psj->scan_job.ops = worker_object_ops;
+                       psj->scan_job.gc_thread_gray_queue = NULL;
+                       psj->job_index = i;
+                       sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+               }
+
+               /*
+                * If we enqueue a job while workers are running we need to sgen_workers_ensure_awake
+                * in order to make sure that we are running the idle func and draining all worker
+                * gray queues. The operation of starting workers implies this, so we start them after
+                * in order to avoid doing this operation twice. The workers will drain the main gray
+                * stack that contained roots and pinned objects and also scan the mod union card
+                * table.
+                */
+               sgen_workers_start_all_workers (worker_object_ops, NULL);
+               sgen_workers_join ();
        }
 
        sgen_pin_stats_report ();
@@ -1895,7 +1972,11 @@ major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason,
                g_assert (major_collector.is_concurrent);
                concurrent_collection_in_progress = TRUE;
 
-               object_ops = &major_collector.major_ops_concurrent_start;
+               if (major_collector.is_parallel)
+                       object_ops = &major_collector.major_ops_conc_par_start;
+               else
+                       object_ops = &major_collector.major_ops_concurrent_start;
+
        } else {
                object_ops = &major_collector.major_ops_serial;
        }
@@ -1916,7 +1997,7 @@ major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason,
        if (major_collector.start_major_collection)
                major_collector.start_major_collection ();
 
-       major_copy_or_mark_from_roots (gc_thread_gray_queue, old_next_pin_slot, concurrent ? COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT : COPY_OR_MARK_FROM_ROOTS_SERIAL, object_ops);
+       major_copy_or_mark_from_roots (gc_thread_gray_queue, old_next_pin_slot, concurrent ? COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT : COPY_OR_MARK_FROM_ROOTS_SERIAL, object_ops, object_ops);
 }
 
 static void
@@ -1931,9 +2012,14 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
        TV_GETTIME (btv);
 
        if (concurrent_collection_in_progress) {
+               SgenObjectOperations *worker_object_ops;
                object_ops = &major_collector.major_ops_concurrent_finish;
+               if (major_collector.is_parallel)
+                       worker_object_ops = &major_collector.major_ops_conc_par_finish;
+               else
+                       worker_object_ops = object_ops;
 
-               major_copy_or_mark_from_roots (gc_thread_gray_queue, NULL, COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT, object_ops);
+               major_copy_or_mark_from_roots (gc_thread_gray_queue, NULL, COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT, object_ops, worker_object_ops);
 
 #ifdef SGEN_DEBUG_INTERNAL_ALLOC
                main_gc_thread = NULL;
@@ -1975,9 +2061,6 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
        reset_heap_boundaries ();
        sgen_update_heap_boundaries ((mword)sgen_get_nursery_start (), (mword)sgen_get_nursery_end ());
 
-       if (whole_heap_check_before_collection)
-               sgen_check_whole_heap (FALSE);
-
        /* walk the pin_queue, build up the fragment list of free memory, unmark
         * pinned objects as we go, memzero() the empty fragments so they are ready for the
         * next allocations.
@@ -2257,7 +2340,7 @@ sgen_perform_collection (size_t requested_size, int generation_to_collect, const
                if (concurrent_collection_in_progress)
                        major_update_concurrent_collection ();
 
-               if (collect_nursery (reason, FALSE, NULL, FALSE) && !concurrent_collection_in_progress) {
+               if (collect_nursery (reason, FALSE, NULL) && !concurrent_collection_in_progress) {
                        overflow_generation_to_collect = GENERATION_OLD;
                        overflow_reason = "Minor overflow";
                }
@@ -2267,7 +2350,7 @@ sgen_perform_collection (size_t requested_size, int generation_to_collect, const
        } else {
                SGEN_ASSERT (0, generation_to_collect == GENERATION_OLD, "We should have handled nursery collections above");
                if (major_collector.is_concurrent && !wait_to_finish) {
-                       collect_nursery ("Concurrent start", FALSE, NULL, FALSE);
+                       collect_nursery ("Concurrent start", FALSE, NULL);
                        major_start_concurrent_collection (reason);
                        oldest_generation_collected = GENERATION_NURSERY;
                } else if (major_do_collection (reason, FALSE, wait_to_finish)) {
@@ -2285,7 +2368,7 @@ sgen_perform_collection (size_t requested_size, int generation_to_collect, const
                 */
 
                if (overflow_generation_to_collect == GENERATION_NURSERY)
-                       collect_nursery (overflow_reason, TRUE, NULL, FALSE);
+                       collect_nursery (overflow_reason, TRUE, NULL);
                else
                        major_do_collection (overflow_reason, TRUE, wait_to_finish);
 
@@ -2545,11 +2628,7 @@ sgen_get_current_collection_generation (void)
 void*
 sgen_thread_register (SgenThreadInfo* info, void *stack_bottom_fallback)
 {
-#ifndef HAVE_KW_THREAD
        info->tlab_start = info->tlab_next = info->tlab_temp_end = info->tlab_real_end = NULL;
-#endif
-
-       sgen_init_tlab_info (info);
 
        sgen_client_thread_register (info, stack_bottom_fallback);
 
@@ -2750,6 +2829,8 @@ sgen_gc_init (void)
        char **opts, **ptr;
        char *major_collector_opt = NULL;
        char *minor_collector_opt = NULL;
+       char *params_opts = NULL;
+       char *debug_opts = NULL;
        size_t max_heap = 0;
        size_t soft_limit = 0;
        int result;
@@ -2787,8 +2868,12 @@ sgen_gc_init (void)
 
        mono_coop_mutex_init (&sgen_interruption_mutex);
 
-       if ((env = g_getenv (MONO_GC_PARAMS_NAME))) {
-               opts = g_strsplit (env, ",", -1);
+       if ((env = g_getenv (MONO_GC_PARAMS_NAME)) || gc_params_options) {
+               params_opts = g_strdup_printf ("%s,%s", gc_params_options ? gc_params_options : "", env ? env : "");
+       }
+
+       if (params_opts) {
+               opts = g_strsplit (params_opts, ",", -1);
                for (ptr = opts; *ptr; ++ptr) {
                        char *opt = *ptr;
                        if (g_str_has_prefix (opt, "major=")) {
@@ -2839,6 +2924,8 @@ sgen_gc_init (void)
                sgen_marksweep_init (&major_collector);
        } else if (!strcmp (major_collector_opt, "marksweep-conc")) {
                sgen_marksweep_conc_init (&major_collector);
+       } else if (!strcmp (major_collector_opt, "marksweep-conc-par")) {
+               sgen_marksweep_conc_par_init (&major_collector);
        } else {
                sgen_env_var_error (MONO_GC_PARAMS_NAME, "Using `" DEFAULT_MAJOR_NAME "` instead.", "Unknown major collector `%s'.", major_collector_opt);
                goto use_default_major;
@@ -2990,15 +3077,22 @@ sgen_gc_init (void)
        if (minor_collector_opt)
                g_free (minor_collector_opt);
 
+       if (params_opts)
+               g_free (params_opts);
+
        alloc_nursery ();
 
        sgen_pinning_init ();
        sgen_cement_init (cement_enabled);
 
-       if ((env = g_getenv (MONO_GC_DEBUG_NAME))) {
+       if ((env = g_getenv (MONO_GC_DEBUG_NAME)) || gc_debug_options) {
+               debug_opts = g_strdup_printf ("%s,%s", gc_debug_options ? gc_debug_options  : "", env ? env : "");
+       }
+
+       if (debug_opts) {
                gboolean usage_printed = FALSE;
 
-               opts = g_strsplit (env, ",", -1);
+               opts = g_strsplit (debug_opts, ",", -1);
                for (ptr = opts; ptr && *ptr; ptr ++) {
                        char *opt = *ptr;
                        if (!strcmp (opt, ""))
@@ -3129,14 +3223,25 @@ sgen_gc_init (void)
                g_strfreev (opts);
        }
 
+       if (debug_opts)
+               g_free (debug_opts);
+
        if (check_mark_bits_after_major_collection)
                nursery_clear_policy = CLEAR_AT_GC;
 
        if (major_collector.post_param_init)
                major_collector.post_param_init (&major_collector);
 
-       if (major_collector.needs_thread_pool)
-               sgen_workers_init (1);
+       if (major_collector.needs_thread_pool) {
+               int num_workers = 1;
+               if (major_collector.is_parallel) {
+                       /* FIXME Detect the number of physical cores, instead of logical */
+                       num_workers = mono_cpu_count () / 2;
+                       if (num_workers < 1)
+                               num_workers = 1;
+               }
+               sgen_workers_init (num_workers, (SgenWorkerCallback) major_collector.worker_init_cb);
+       }
 
        sgen_memgov_init (max_heap, soft_limit, debug_print_allowance, allowance_ratio, save_target);
 
@@ -3263,7 +3368,7 @@ sgen_check_whole_heap_stw (void)
 {
        sgen_stop_world (0);
        sgen_clear_nursery_fragments ();
-       sgen_check_whole_heap (FALSE);
+       sgen_check_whole_heap (TRUE);
        sgen_restart_world (0);
 }