Merge pull request #5198 from BrzVlad/fix-binprot-stats
authorVlad Brezae <brezaevlad@gmail.com>
Thu, 20 Jul 2017 17:30:49 +0000 (20:30 +0300)
committerGitHub <noreply@github.com>
Thu, 20 Jul 2017 17:30:49 +0000 (20:30 +0300)
[sgen] Parallel gc improvements

mono/metadata/sgen-client-mono.h
mono/sgen/sgen-array-list.c
mono/sgen/sgen-array-list.h
mono/sgen/sgen-gc.c
mono/sgen/sgen-gc.h
mono/sgen/sgen-marksweep.c
mono/sgen/sgen-protocol-def.h
mono/sgen/sgen-thread-pool.c
mono/sgen/sgen-thread-pool.h
mono/sgen/sgen-workers.c
mono/sgen/sgen-workers.h

index e3cae00e7e605d20cd70630e0a534b4155ea0685..a5ec02d480e21cb4477a3cf7e5f1eb4416465d0a 100644 (file)
@@ -686,6 +686,16 @@ sgen_client_binary_protocol_pin_stats (int objects_pinned_in_nursery, size_t byt
 {
 }
 
+static void G_GNUC_UNUSED
+sgen_client_binary_protocol_worker_finish_stats (int worker_index, int generation, gboolean forced, long long major_scan, long long los_scan, long long work_time)
+{
+}
+
+static void G_GNUC_UNUSED
+sgen_client_binary_protocol_collection_end_stats (long long major_scan, long long los_scan, long long finish_stack)
+{
+}
+
 #define TLAB_ACCESS_INIT       SgenThreadInfo *__thread_info__ = (SgenThreadInfo*)mono_tls_get_sgen_thread_info ()
 #define IN_CRITICAL_REGION (__thread_info__->client_info.in_critical_region)
 
index 0247a6b12695c3e1368d327fefd5f58dd31fd2da..2fd54d986d0d6ded4b26dc3efb5203198bd4d3ec 100644 (file)
@@ -207,4 +207,27 @@ sgen_array_list_default_is_slot_set (volatile gpointer *slot)
        return *slot != NULL;
 }
 
+/* Removes all NULL pointers from the array. Not thread safe */
+void
+sgen_array_list_remove_nulls (SgenArrayList *array)
+{
+       guint32 start = 0;
+       volatile gpointer *slot;
+       gboolean skipped = FALSE;
+
+       SGEN_ARRAY_LIST_FOREACH_SLOT (array, slot) {
+               if (*slot) {
+                       *sgen_array_list_get_slot (array, start++) = *slot;
+                       if (skipped)
+                               *slot = NULL;
+               } else {
+                       skipped = TRUE;
+               }
+       } SGEN_ARRAY_LIST_END_FOREACH_SLOT;
+
+       mono_memory_write_barrier ();
+       array->next_slot = start;
+       array->slot_hint = start;
+}
+
 #endif
index 12e8bcdecf70f419c23c0cc65bcad7faac32e396..2ae3e63c2e2e5a3566af29a6dba2c369ee570807 100644 (file)
@@ -135,6 +135,6 @@ guint32 sgen_array_list_add (SgenArrayList *array, gpointer ptr, int data, gbool
 guint32 sgen_array_list_find (SgenArrayList *array, gpointer ptr);
 gboolean sgen_array_list_default_cas_setter (volatile gpointer *slot, gpointer ptr, int data);
 gboolean sgen_array_list_default_is_slot_set (volatile gpointer *slot);
-
+void sgen_array_list_remove_nulls (SgenArrayList *array);
 
 #endif
index 8f6ee8aa833b60c240bb421bea0236c118fd28ad..49c630f03f373ef59a4a37d6aafc96f29a157f0f 100644 (file)
@@ -282,7 +282,8 @@ static guint64 time_major_pre_collection_fragment_clear = 0;
 static guint64 time_major_pinning = 0;
 static guint64 time_major_scan_pinned = 0;
 static guint64 time_major_scan_roots = 0;
-static guint64 time_major_scan_mod_union = 0;
+static guint64 time_major_scan_mod_union_blocks = 0;
+static guint64 time_major_scan_mod_union_los = 0;
 static guint64 time_major_finish_gray_stack = 0;
 static guint64 time_major_free_bigobjs = 0;
 static guint64 time_major_los_sweep = 0;
@@ -1279,7 +1280,8 @@ init_stats (void)
        mono_counters_register ("Major pinning", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_pinning);
        mono_counters_register ("Major scan pinned", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_pinned);
        mono_counters_register ("Major scan roots", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_roots);
-       mono_counters_register ("Major scan mod union", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_mod_union);
+       mono_counters_register ("Major scan mod union blocks", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_mod_union_blocks);
+       mono_counters_register ("Major scan mod union los", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_scan_mod_union_los);
        mono_counters_register ("Major finish gray stack", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_finish_gray_stack);
        mono_counters_register ("Major free big objects", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_free_bigobjs);
        mono_counters_register ("Major LOS sweep", MONO_COUNTER_GC | MONO_COUNTER_ULONG | MONO_COUNTER_TIME, &time_major_los_sweep);
@@ -1361,6 +1363,7 @@ typedef struct {
 typedef struct {
        ScanJob scan_job;
        int job_index, job_split_count;
+       int data;
 } ParallelScanJob;
 
 static ScanCopyContext
@@ -1444,9 +1447,12 @@ job_scan_major_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        SGEN_TV_GETTIME (atv);
-       major_collector.scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, job_data->job_index, job_data->job_split_count);
+       major_collector.scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, job_data->job_index, job_data->job_split_count, job_data->data);
        SGEN_TV_GETTIME (btv);
        time_minor_scan_major_blocks += SGEN_TV_ELAPSED (atv, btv);
+
+       if (worker_data_untyped)
+               ((WorkerData*)worker_data_untyped)->major_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
@@ -1461,48 +1467,79 @@ job_scan_los_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
        sgen_los_scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, job_data->job_index, job_data->job_split_count);
        SGEN_TV_GETTIME (btv);
        time_minor_scan_los += SGEN_TV_ELAPSED (atv, btv);
+
+       if (worker_data_untyped)
+               ((WorkerData*)worker_data_untyped)->los_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
 job_scan_major_mod_union_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
        ParallelScanJob *job_data = (ParallelScanJob*)job;
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
-       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, job_data->job_split_count);
+       SGEN_TV_GETTIME (atv);
+       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, job_data->job_split_count, job_data->data);
+       SGEN_TV_GETTIME (btv);
+       time_major_scan_mod_union_blocks += SGEN_TV_ELAPSED (atv, btv);
+
+       if (worker_data_untyped)
+               ((WorkerData*)worker_data_untyped)->major_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
 job_scan_los_mod_union_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
        ParallelScanJob *job_data = (ParallelScanJob*)job;
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
+       SGEN_TV_GETTIME (atv);
        sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, job_data->job_split_count);
+       SGEN_TV_GETTIME (btv);
+       time_major_scan_mod_union_los += SGEN_TV_ELAPSED (atv, btv);
+
+       if (worker_data_untyped)
+               ((WorkerData*)worker_data_untyped)->los_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
 job_major_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
        ParallelScanJob *job_data = (ParallelScanJob*)job;
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
+       SGEN_TV_GETTIME (atv);
+       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, job_data->job_split_count, job_data->data);
+       SGEN_TV_GETTIME (btv);
 
-       major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, job_data->job_split_count);
+       g_assert (worker_data_untyped);
+       ((WorkerData*)worker_data_untyped)->major_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
 job_los_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
 {
+       SGEN_TV_DECLARE (atv);
+       SGEN_TV_DECLARE (btv);
        ParallelScanJob *job_data = (ParallelScanJob*)job;
        ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
 
        g_assert (concurrent_collection_in_progress);
-
+       SGEN_TV_GETTIME (atv);
        sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, job_data->job_split_count);
+       SGEN_TV_GETTIME (btv);
+
+       g_assert (worker_data_untyped);
+       ((WorkerData*)worker_data_untyped)->los_scan_time += SGEN_TV_ELAPSED (atv, btv);
 }
 
 static void
@@ -1521,6 +1558,7 @@ workers_finish_callback (void)
 {
        ParallelScanJob *psj;
        ScanJob *sj;
+       size_t num_major_sections = major_collector.get_num_major_sections ();
        int split_count = sgen_workers_get_job_split_count (GENERATION_OLD);
        int i;
        /* Mod union preclean jobs */
@@ -1529,6 +1567,7 @@ workers_finish_callback (void)
                psj->scan_job.gc_thread_gray_queue = NULL;
                psj->job_index = i;
                psj->job_split_count = split_count;
+               psj->data = num_major_sections / split_count;
                sgen_workers_enqueue_job (GENERATION_OLD, &psj->scan_job.job, TRUE);
        }
 
@@ -1555,6 +1594,7 @@ static void
 enqueue_scan_remembered_set_jobs (SgenGrayQueue *gc_thread_gray_queue, SgenObjectOperations *ops, gboolean enqueue)
 {
        int i, split_count = sgen_workers_get_job_split_count (GENERATION_NURSERY);
+       size_t num_major_sections = major_collector.get_num_major_sections ();
        ScanJob *sj;
 
        sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan wbroots", job_scan_wbroots, sizeof (ScanJob));
@@ -1570,6 +1610,7 @@ enqueue_scan_remembered_set_jobs (SgenGrayQueue *gc_thread_gray_queue, SgenObjec
                psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
                psj->job_index = i;
                psj->job_split_count = split_count;
+               psj->data = num_major_sections / split_count;
                sgen_workers_enqueue_job (GENERATION_NURSERY, &psj->scan_job.job, enqueue);
 
                psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan LOS remsets", job_scan_los_card_table, sizeof (ParallelScanJob));
@@ -1650,6 +1691,9 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
        TV_DECLARE (btv);
        SGEN_TV_DECLARE (last_minor_collection_start_tv);
        SGEN_TV_DECLARE (last_minor_collection_end_tv);
+       guint64 major_scan_start = time_minor_scan_major_blocks;
+       guint64 los_scan_start = time_minor_scan_los;
+       guint64 finish_gray_start = time_minor_finish_gray_stack;
 
        if (disable_minor_collections)
                return TRUE;
@@ -1844,6 +1888,14 @@ collect_nursery (const char *reason, gboolean is_overflow, SgenGrayQueue *unpin_
        current_collection_generation = -1;
        objects_pinned = 0;
 
+       if (is_parallel)
+               binary_protocol_collection_end_stats (0, 0, time_minor_finish_gray_stack - finish_gray_start);
+       else
+               binary_protocol_collection_end_stats (
+                       time_minor_scan_major_blocks - major_scan_start,
+                       time_minor_scan_los - los_scan_start,
+                       time_minor_finish_gray_stack - finish_gray_start);
+
        binary_protocol_collection_end (gc_stats.minor_gc_count - 1, GENERATION_NURSERY, 0, 0);
 
        if (check_nursery_objects_pinned && !sgen_minor_collector.is_split)
@@ -1990,10 +2042,13 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
        if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
                if (object_ops_par != NULL)
                        sgen_workers_set_num_active_workers (GENERATION_OLD, 0);
-               if (sgen_workers_have_idle_work (GENERATION_OLD)) {
+               if (object_ops_par == NULL && sgen_workers_have_idle_work (GENERATION_OLD)) {
                        /*
                         * We force the finish of the worker with the new object ops context
-                        * which can also do copying. We need to have finished pinning.
+                        * which can also do copying. We need to have finished pinning. On the
+                        * parallel collector, there is no need to drain the private queues
+                        * here, since we can do it as part of the finishing work, achieving
+                        * better work distribution.
                         */
                        sgen_workers_start_all_workers (GENERATION_OLD, object_ops_nopar, object_ops_par, NULL);
 
@@ -2034,6 +2089,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
 
        if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
                int i, split_count = sgen_workers_get_job_split_count (GENERATION_OLD);
+               size_t num_major_sections = major_collector.get_num_major_sections ();
                gboolean parallel = object_ops_par != NULL;
 
                /* If we're not parallel we finish the collection on the gc thread */
@@ -2049,6 +2105,7 @@ major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_
                        psj->scan_job.gc_thread_gray_queue = gc_thread_gray_queue;
                        psj->job_index = i;
                        psj->job_split_count = split_count;
+                       psj->data = num_major_sections / split_count;
                        sgen_workers_enqueue_job (GENERATION_OLD, &psj->scan_job.job, parallel);
 
                        psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan LOS mod union cardtable", job_scan_los_mod_union_card_table, sizeof (ParallelScanJob));
@@ -2138,8 +2195,9 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
        mword fragment_total;
        TV_DECLARE (atv);
        TV_DECLARE (btv);
-
-       TV_GETTIME (btv);
+       guint64 major_scan_start = time_major_scan_mod_union_blocks;
+       guint64 los_scan_start = time_major_scan_mod_union_los;
+       guint64 finish_gray_start = time_major_finish_gray_stack;
 
        if (concurrent_collection_in_progress) {
                SgenObjectOperations *object_ops_par = NULL;
@@ -2159,6 +2217,7 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
 
        sgen_workers_assert_gray_queue_is_empty (GENERATION_OLD);
 
+       TV_GETTIME (btv);
        finish_gray_stack (GENERATION_OLD, CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, gc_thread_gray_queue));
        TV_GETTIME (atv);
        time_major_finish_gray_stack += TV_ELAPSED (btv, atv);
@@ -2260,6 +2319,13 @@ major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason
        binary_protocol_flush_buffers (FALSE);
 
        //consistency_check ();
+       if (major_collector.is_parallel)
+                binary_protocol_collection_end_stats (0, 0, time_major_finish_gray_stack - finish_gray_start);
+        else
+                binary_protocol_collection_end_stats (
+                        time_major_scan_mod_union_blocks - major_scan_start,
+                        time_major_scan_mod_union_los - los_scan_start,
+                        time_major_finish_gray_stack - finish_gray_start);
 
        binary_protocol_collection_end (gc_stats.major_gc_count - 1, GENERATION_OLD, counts.num_scanned_objects, counts.num_unique_scanned_objects);
 }
@@ -3171,9 +3237,8 @@ init_sgen_mode (SgenMode mode)
                /*
                 * Use concurrent major and dynamic nursery with a more
                 * aggressive shrinking relative to pause times.
-                * FIXME use parallel minors
                 */
-               minor = SGEN_MINOR_SIMPLE;
+               minor = SGEN_MINOR_SIMPLE_PARALLEL;
                major = SGEN_MAJOR_CONCURRENT;
                dynamic_nursery = TRUE;
                sgen_max_pause_margin = SGEN_PAUSE_MODE_MAX_PAUSE_MARGIN;
index 2faad109d1ec06e3496ebd23f1ac4ab1d5d39de7..3e2620b219fdfca712a8afec98468c49b63945b7 100644 (file)
@@ -650,7 +650,7 @@ struct _SgenMajorCollector {
        void (*free_non_pinned_object) (GCObject *obj, size_t size);
        void (*pin_objects) (SgenGrayQueue *queue);
        void (*pin_major_object) (GCObject *obj, SgenGrayQueue *queue);
-       void (*scan_card_table) (CardTableScanType scan_type, ScanCopyContext ctx, int job_index, int job_split_count);
+       void (*scan_card_table) (CardTableScanType scan_type, ScanCopyContext ctx, int job_index, int job_split_count, int block_count);
        void (*iterate_live_block_ranges) (sgen_cardtable_block_callback callback);
        void (*iterate_block_ranges) (sgen_cardtable_block_callback callback);
        void (*update_cardtable_mod_union) (void);
index 04feb680f2d77b9f6b239c0046cf4abf165cb589..35e1195dbfbbfc27c96546a05684c22ad1b6fa3e 100644 (file)
@@ -212,6 +212,7 @@ static SgenArrayList allocated_blocks = SGEN_ARRAY_LIST_INIT (NULL, sgen_array_l
 /* non-allocated block free-list */
 static void *empty_blocks = NULL;
 static size_t num_empty_blocks = 0;
+static gboolean compact_blocks = FALSE;
 
 /*
  * We can iterate the block list also while sweep is in progress but we
@@ -234,6 +235,16 @@ static size_t num_empty_blocks = 0;
                (bl) = BLOCK_UNTAG ((bl));
 #define END_FOREACH_BLOCK_NO_LOCK      } SGEN_ARRAY_LIST_END_FOREACH_SLOT; }
 
+#define FOREACH_BLOCK_RANGE_HAS_REFERENCES_NO_LOCK(bl,begin,end,index,hr) {    \
+       volatile gpointer *slot;                                        \
+       SGEN_ARRAY_LIST_FOREACH_SLOT_RANGE (&allocated_blocks, begin, end, slot, index) { \
+               (bl) = (MSBlockInfo *) (*slot);                         \
+               if (!(bl))                                              \
+                       continue;                                       \
+               (hr) = BLOCK_IS_TAGGED_HAS_REFERENCES ((bl));           \
+               (bl) = BLOCK_UNTAG ((bl));
+#define END_FOREACH_BLOCK_RANGE_NO_LOCK        } SGEN_ARRAY_LIST_END_FOREACH_SLOT_RANGE; }
+
 static volatile size_t num_major_sections = 0;
 /*
  * One free block list for each block object size.  We add and remove blocks from these
@@ -1601,6 +1612,8 @@ sweep_start (void)
 
        sgen_workers_foreach (GENERATION_NURSERY, sgen_worker_clear_free_block_lists);
        sgen_workers_foreach (GENERATION_OLD, sgen_worker_clear_free_block_lists);
+
+       compact_blocks = TRUE;
 }
 
 static void sweep_finish (void);
@@ -1974,6 +1987,18 @@ major_start_nursery_collection (void)
 #endif
 
        old_num_major_sections = num_major_sections;
+
+       /* Compact the block list if it hasn't been compacted in a while and nobody is using it */
+       if (compact_blocks && !sweep_in_progress () && !sweep_blocks_job && !sgen_concurrent_collection_in_progress ()) {
+               /*
+                * We support null elements in the array but do regular compaction to avoid
+                * excessive traversal of the array and to facilitate splitting into well
+                * balanced sections for parallel modes. We compact as soon as possible after
+                * sweep.
+                */
+               sgen_array_list_remove_nulls (&allocated_blocks);
+               compact_blocks = FALSE;
+       }
 }
 
 static void
@@ -2603,10 +2628,24 @@ scan_card_table_for_block (MSBlockInfo *block, CardTableScanType scan_type, Scan
 }
 
 static void
-major_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx, int job_index, int job_split_count)
+major_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx, int job_index, int job_split_count, int block_count)
 {
        MSBlockInfo *block;
        gboolean has_references, was_sweeping, skip_scan;
+       int first_block, last_block, index;
+
+       /*
+        * The last_block's index is at least (num_major_sections - 1) since we
+        * can have nulls in the allocated_blocks list. The last worker will
+        * scan the left-overs of the list. We expect few null entries in the
+        * allocated_blocks list, therefore using num_major_sections for computing
+        * block_count shouldn't affect work distribution.
+        */
+       first_block = block_count * job_index;
+       if (job_index == job_split_count - 1)
+               last_block = allocated_blocks.next_slot;
+       else
+               last_block = block_count * (job_index + 1);
 
        if (!concurrent_mark)
                g_assert (scan_type == CARDTABLE_SCAN_GLOBAL);
@@ -2616,11 +2655,9 @@ major_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx, int job
        was_sweeping = sweep_in_progress ();
 
        binary_protocol_major_card_table_scan_start (sgen_timestamp (), scan_type & CARDTABLE_SCAN_MOD_UNION);
-       FOREACH_BLOCK_HAS_REFERENCES_NO_LOCK (block, has_references) {
-               if (__index % job_split_count != job_index)
-                       continue;
+       FOREACH_BLOCK_RANGE_HAS_REFERENCES_NO_LOCK (block, first_block, last_block, index, has_references) {
 #ifdef PREFETCH_CARDS
-               int prefetch_index = __index + 6 * job_split_count;
+               int prefetch_index = index + 6;
                if (prefetch_index < allocated_blocks.next_slot) {
                        MSBlockInfo *prefetch_block = BLOCK_UNTAG (*sgen_array_list_get_slot (&allocated_blocks, prefetch_index));
                        PREFETCH_READ (prefetch_block);
@@ -2631,7 +2668,6 @@ major_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx, int job
                        }
                 }
 #endif
-
                if (!has_references)
                        continue;
                skip_scan = FALSE;
@@ -2655,16 +2691,16 @@ major_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx, int job
                                 * sweep start since we are in a nursery collection. Also avoid CAS-ing
                                 */
                                if (sweep_in_progress ()) {
-                                       skip_scan = !ensure_block_is_checked_for_sweeping (__index, TRUE, NULL);
+                                       skip_scan = !ensure_block_is_checked_for_sweeping (index, TRUE, NULL);
                                } else if (was_sweeping) {
                                        /* Recheck in case sweep finished after dereferencing the slot */
-                                       skip_scan = *sgen_array_list_get_slot (&allocated_blocks, __index) == 0;
+                                       skip_scan = *sgen_array_list_get_slot (&allocated_blocks, index) == 0;
                                }
                        }
                }
                if (!skip_scan)
                        scan_card_table_for_block (block, scan_type, ctx);
-       } END_FOREACH_BLOCK_NO_LOCK;
+       } END_FOREACH_BLOCK_RANGE_NO_LOCK;
        binary_protocol_major_card_table_scan_end (sgen_timestamp (), scan_type & CARDTABLE_SCAN_MOD_UNION);
 }
 
index 08bfdfe890ee2d9ddf8a4e91f1f8158d66363a36..7406ab2c5b335e69e7e5973ca253803e5d17ba80 100644 (file)
@@ -380,47 +380,47 @@ MATCH_INDEX (ptr == entry->obj ? 0 : ptr == entry->ptr ? 1 : ptr == entry->value
 IS_VTABLE_MATCH (ptr == entry->value_vtable)
 END_PROTOCOL_ENTRY_HEAVY
 
-BEGIN_PROTOCOL_ENTRY2 (binary_protocol_major_card_table_scan_start, TYPE_LONGLONG, timestamp, TYPE_BOOL, mod_union)
+BEGIN_PROTOCOL_ENTRY_HEAVY2 (binary_protocol_major_card_table_scan_start, TYPE_LONGLONG, timestamp, TYPE_BOOL, mod_union)
 DEFAULT_PRINT ()
 IS_ALWAYS_MATCH (TRUE)
 MATCH_INDEX (BINARY_PROTOCOL_MATCH)
 IS_VTABLE_MATCH (FALSE)
-END_PROTOCOL_ENTRY
+END_PROTOCOL_ENTRY_HEAVY
 
-BEGIN_PROTOCOL_ENTRY2 (binary_protocol_major_card_table_scan_end, TYPE_LONGLONG, timestamp, TYPE_BOOL, mod_union)
+BEGIN_PROTOCOL_ENTRY_HEAVY2 (binary_protocol_major_card_table_scan_end, TYPE_LONGLONG, timestamp, TYPE_BOOL, mod_union)
 DEFAULT_PRINT ()
 IS_ALWAYS_MATCH (TRUE)
 MATCH_INDEX (BINARY_PROTOCOL_MATCH)
 IS_VTABLE_MATCH (FALSE)
-END_PROTOCOL_ENTRY
+END_PROTOCOL_ENTRY_HEAVY
 
-BEGIN_PROTOCOL_ENTRY2 (binary_protocol_los_card_table_scan_start, TYPE_LONGLONG, timestamp, TYPE_BOOL, mod_union)
+BEGIN_PROTOCOL_ENTRY_HEAVY2 (binary_protocol_los_card_table_scan_start, TYPE_LONGLONG, timestamp, TYPE_BOOL, mod_union)
 DEFAULT_PRINT ()
 IS_ALWAYS_MATCH (TRUE)
 MATCH_INDEX (BINARY_PROTOCOL_MATCH)
 IS_VTABLE_MATCH (FALSE)
-END_PROTOCOL_ENTRY
+END_PROTOCOL_ENTRY_HEAVY
 
-BEGIN_PROTOCOL_ENTRY2 (binary_protocol_los_card_table_scan_end, TYPE_LONGLONG, timestamp, TYPE_BOOL, mod_union)
+BEGIN_PROTOCOL_ENTRY_HEAVY2 (binary_protocol_los_card_table_scan_end, TYPE_LONGLONG, timestamp, TYPE_BOOL, mod_union)
 DEFAULT_PRINT ()
 IS_ALWAYS_MATCH (TRUE)
 MATCH_INDEX (BINARY_PROTOCOL_MATCH)
 IS_VTABLE_MATCH (FALSE)
-END_PROTOCOL_ENTRY
+END_PROTOCOL_ENTRY_HEAVY
 
-BEGIN_PROTOCOL_ENTRY2 (binary_protocol_finish_gray_stack_start, TYPE_LONGLONG, timestamp, TYPE_INT, generation)
+BEGIN_PROTOCOL_ENTRY_HEAVY2 (binary_protocol_finish_gray_stack_start, TYPE_LONGLONG, timestamp, TYPE_INT, generation)
 DEFAULT_PRINT ()
 IS_ALWAYS_MATCH (TRUE)
 MATCH_INDEX (BINARY_PROTOCOL_MATCH)
 IS_VTABLE_MATCH (FALSE)
-END_PROTOCOL_ENTRY
+END_PROTOCOL_ENTRY_HEAVY
 
-BEGIN_PROTOCOL_ENTRY2 (binary_protocol_finish_gray_stack_end, TYPE_LONGLONG, timestamp, TYPE_INT, generation)
+BEGIN_PROTOCOL_ENTRY_HEAVY2 (binary_protocol_finish_gray_stack_end, TYPE_LONGLONG, timestamp, TYPE_INT, generation)
 DEFAULT_PRINT ()
 IS_ALWAYS_MATCH (TRUE)
 MATCH_INDEX (BINARY_PROTOCOL_MATCH)
 IS_VTABLE_MATCH (FALSE)
-END_PROTOCOL_ENTRY
+END_PROTOCOL_ENTRY_HEAVY
 
 BEGIN_PROTOCOL_ENTRY2 (binary_protocol_worker_finish, TYPE_LONGLONG, timestamp, TYPE_BOOL, forced)
 DEFAULT_PRINT ()
@@ -457,6 +457,20 @@ MATCH_INDEX (BINARY_PROTOCOL_MATCH)
 IS_VTABLE_MATCH (FALSE)
 END_PROTOCOL_ENTRY
 
+BEGIN_PROTOCOL_ENTRY6 (binary_protocol_worker_finish_stats, TYPE_INT, worker_index, TYPE_INT, generation, TYPE_BOOL, forced, TYPE_LONGLONG, major_scan, TYPE_LONGLONG, los_scan, TYPE_LONGLONG, work_time)
+DEFAULT_PRINT ()
+IS_ALWAYS_MATCH (TRUE)
+MATCH_INDEX (BINARY_PROTOCOL_MATCH)
+IS_VTABLE_MATCH (FALSE)
+END_PROTOCOL_ENTRY
+
+BEGIN_PROTOCOL_ENTRY3 (binary_protocol_collection_end_stats, TYPE_LONGLONG, major_scan, TYPE_LONGLONG, los_scan, TYPE_LONGLONG, finish_stack)
+DEFAULT_PRINT ()
+IS_ALWAYS_MATCH (TRUE)
+MATCH_INDEX (BINARY_PROTOCOL_MATCH)
+IS_VTABLE_MATCH (FALSE)
+END_PROTOCOL_ENTRY
+
 #undef BEGIN_PROTOCOL_ENTRY0
 #undef BEGIN_PROTOCOL_ENTRY1
 #undef BEGIN_PROTOCOL_ENTRY2
index 6a03bc3d69d164b5cd949b0b71760fc84f41d04b..211bc8b7277975471954dec78c7775c9a8e8c9f6 100644 (file)
@@ -21,6 +21,7 @@ static mono_cond_t done_cond;
 
 static int threads_num;
 static MonoNativeThreadId threads [SGEN_THREADPOOL_MAX_NUM_THREADS];
+static int threads_context [SGEN_THREADPOOL_MAX_NUM_THREADS];
 
 static volatile gboolean threadpool_shutdown;
 static volatile int threads_finished;
@@ -166,8 +167,9 @@ get_work (int worker_index, int *work_context, int *do_idle, SgenThreadPoolJob *
 }
 
 static mono_native_thread_return_t
-thread_func (int worker_index)
+thread_func (void *data)
 {
+       int worker_index = (int)(gsize)data;
        int current_context;
        void *thread_data = NULL;
 
@@ -190,7 +192,9 @@ thread_func (int worker_index)
                SgenThreadPoolJob *job = NULL;
                SgenThreadPoolContext *context = NULL;
 
+               threads_context [worker_index] = -1;
                get_work (worker_index, &current_context, &do_idle, &job);
+               threads_context [worker_index] = current_context;
 
                if (!threadpool_shutdown) {
                        context = &pool_contexts [current_context];
@@ -358,13 +362,13 @@ sgen_thread_pool_idle_signal (int context_id)
 }
 
 void
-sgen_thread_pool_idle_wait (int context_id)
+sgen_thread_pool_idle_wait (int context_id, SgenThreadPoolContinueIdleWaitFunc continue_wait)
 {
        SGEN_ASSERT (0, pool_contexts [context_id].idle_job_func, "Why are we waiting for idle without an idle function?");
 
        mono_os_mutex_lock (&lock);
 
-       while (pool_contexts [context_id].continue_idle_job_func (NULL, context_id))
+       while (continue_wait (context_id, threads_context))
                mono_os_cond_wait (&done_cond, &lock);
 
        mono_os_mutex_unlock (&lock);
index b13848a5a80d20ef938cde11c3ce8371b4a6cec6..b62e85fe35ba6e552814e9d68b62a7a4000e31f6 100644 (file)
@@ -24,6 +24,7 @@ typedef void (*SgenThreadPoolThreadInitFunc) (void*);
 typedef void (*SgenThreadPoolIdleJobFunc) (void*);
 typedef gboolean (*SgenThreadPoolContinueIdleJobFunc) (void*, int);
 typedef gboolean (*SgenThreadPoolShouldWorkFunc) (void*);
+typedef gboolean (*SgenThreadPoolContinueIdleWaitFunc) (int, int*);
 
 struct _SgenThreadPoolJob {
        const char *name;
@@ -60,7 +61,7 @@ void sgen_thread_pool_job_enqueue (int context_id, SgenThreadPoolJob *job);
 void sgen_thread_pool_job_wait (int context_id, SgenThreadPoolJob *job);
 
 void sgen_thread_pool_idle_signal (int context_id);
-void sgen_thread_pool_idle_wait (int context_id);
+void sgen_thread_pool_idle_wait (int context_id, SgenThreadPoolContinueIdleWaitFunc continue_wait);
 
 void sgen_thread_pool_wait_for_all_jobs (int context_id);
 
index 8a615ede6388b42fd1e4b90f50347264e64bba94..ccb9f9c5f27e9678ebc1078607b5c863e527517f 100644 (file)
@@ -56,8 +56,6 @@ set_state (WorkerData *data, State old_state, State new_state)
                SGEN_ASSERT (0, old_state == STATE_WORKING, "We can only transition to NOT WORKING from WORKING");
        else if (new_state == STATE_WORKING)
                SGEN_ASSERT (0, old_state == STATE_WORK_ENQUEUED, "We can only transition to WORKING from WORK ENQUEUED");
-       if (new_state == STATE_NOT_WORKING || new_state == STATE_WORKING)
-               SGEN_ASSERT (6, sgen_thread_pool_is_thread_pool_thread (mono_native_thread_id_get ()), "Only the worker thread is allowed to transition to NOT_WORKING or WORKING");
 
        return InterlockedCompareExchange (&data->state, new_state, old_state) == old_state;
 }
@@ -94,6 +92,9 @@ sgen_workers_ensure_awake (WorkerContext *context)
                                break;
 
                        did_set_state = set_state (&context->workers_data [i], old_state, STATE_WORK_ENQUEUED);
+
+                       if (did_set_state && old_state == STATE_NOT_WORKING)
+                               context->workers_data [i].last_start = sgen_timestamp ();
                } while (!did_set_state);
 
                if (!state_is_working_or_enqueued (old_state))
@@ -110,6 +111,7 @@ worker_try_finish (WorkerData *data)
        State old_state;
        int i, working = 0;
        WorkerContext *context = data->context;
+       gint64 last_start = data->last_start;
 
        ++stat_workers_num_finished;
 
@@ -132,6 +134,12 @@ worker_try_finish (WorkerData *data)
                        /* Make sure each worker has a chance of seeing the enqueued jobs */
                        sgen_workers_ensure_awake (context);
                        SGEN_ASSERT (0, data->state == STATE_WORK_ENQUEUED, "Why did we fail to set our own state to ENQUEUED");
+
+                       /*
+                        * Log to be able to get the duration of normal concurrent M&S phase.
+                        * Worker indexes are 1 based, since 0 is logically considered gc thread.
+                        */
+                       binary_protocol_worker_finish_stats (data - &context->workers_data [0] + 1, context->generation, context->forced_stop, data->major_scan_time, data->los_scan_time, data->total_time + sgen_timestamp () - last_start);
                        goto work_available;
                }
        }
@@ -156,7 +164,8 @@ worker_try_finish (WorkerData *data)
        context->workers_finished = TRUE;
        mono_os_mutex_unlock (&context->finished_lock);
 
-       binary_protocol_worker_finish (sgen_timestamp (), context->forced_stop);
+       data->total_time += (sgen_timestamp () - last_start);
+       binary_protocol_worker_finish_stats (data - &context->workers_data [0] + 1, context->generation, context->forced_stop, data->major_scan_time, data->los_scan_time, data->total_time);
 
        sgen_gray_object_queue_trim_free_list (&data->private_gray_queue);
        return;
@@ -378,17 +387,63 @@ sgen_workers_create_context (int generation, int num_workers)
        }
 }
 
+/* This is called with thread pool lock so no context switch can happen */
+static gboolean
+continue_idle_wait (int calling_context, int *threads_context)
+{
+       WorkerContext *context;
+       int i;
+
+       if (worker_contexts [GENERATION_OLD].workers_num && calling_context == worker_contexts [GENERATION_OLD].thread_pool_context)
+               context = &worker_contexts [GENERATION_OLD];
+       else if (worker_contexts [GENERATION_NURSERY].workers_num && calling_context == worker_contexts [GENERATION_NURSERY].thread_pool_context)
+               context = &worker_contexts [GENERATION_NURSERY];
+       else
+               g_assert_not_reached ();
+
+       /*
+        * We assume there are no pending jobs, since this is called only after
+        * we waited for all the jobs.
+        */
+       for (i = 0; i < context->active_workers_num; i++) {
+               if (threads_context [i] == calling_context)
+                       return TRUE;
+       }
+
+       if (sgen_workers_have_idle_work (context->generation) && !context->forced_stop)
+               return TRUE;
+
+       /*
+        * At this point there are no jobs to be done, and no objects to be scanned
+        * in the gray queues. We can simply asynchronously finish all the workers
+        * from the context that were not finished already (due to being stuck working
+        * in another context)
+        */
+
+       for (i = 0; i < context->active_workers_num; i++) {
+               if (context->workers_data [i].state == STATE_WORK_ENQUEUED)
+                       set_state (&context->workers_data [i], STATE_WORK_ENQUEUED, STATE_WORKING);
+               if (context->workers_data [i].state == STATE_WORKING)
+                       worker_try_finish (&context->workers_data [i]);
+       }
+
+       return FALSE;
+}
+
+
 void
 sgen_workers_stop_all_workers (int generation)
 {
        WorkerContext *context = &worker_contexts [generation];
 
+       mono_os_mutex_lock (&context->finished_lock);
        context->finish_callback = NULL;
-       mono_memory_write_barrier ();
+       mono_os_mutex_unlock (&context->finished_lock);
+
        context->forced_stop = TRUE;
 
        sgen_thread_pool_wait_for_all_jobs (context->thread_pool_context);
-       sgen_thread_pool_idle_wait (context->thread_pool_context);
+       sgen_thread_pool_idle_wait (context->thread_pool_context, continue_idle_wait);
        SGEN_ASSERT (0, !sgen_workers_are_working (context), "Can only signal enqueue work when in no work state");
 
        context->started = FALSE;
@@ -410,6 +465,7 @@ void
 sgen_workers_start_all_workers (int generation, SgenObjectOperations *object_ops_nopar, SgenObjectOperations *object_ops_par, SgenWorkersFinishCallback callback)
 {
        WorkerContext *context = &worker_contexts [generation];
+       int i;
        SGEN_ASSERT (0, !context->started, "Why are we starting to work without finishing previous cycle");
 
        context->idle_func_object_ops_par = object_ops_par;
@@ -418,6 +474,13 @@ sgen_workers_start_all_workers (int generation, SgenObjectOperations *object_ops
        context->finish_callback = callback;
        context->worker_awakenings = 0;
        context->started = TRUE;
+
+       for (i = 0; i < context->active_workers_num; i++) {
+               context->workers_data [i].major_scan_time = 0;
+               context->workers_data [i].los_scan_time = 0;
+               context->workers_data [i].total_time = 0;
+               context->workers_data [i].last_start = 0;
+       }
        mono_memory_write_barrier ();
 
        /*
@@ -435,16 +498,10 @@ sgen_workers_join (int generation)
        WorkerContext *context = &worker_contexts [generation];
        int i;
 
-       /*
-        * It might be the case that a worker didn't get to run anything
-        * in this context, because it was stuck working on a long job
-        * in another context. In this case its state is active (WORK_ENQUEUED)
-        * and we need to wait for it to finish itself.
-        * FIXME Avoid having to wait for the worker to report its own finish.
-        */
+       SGEN_ASSERT (0, !context->finish_callback, "Why are we joining concurrent mark early");
 
        sgen_thread_pool_wait_for_all_jobs (context->thread_pool_context);
-       sgen_thread_pool_idle_wait (context->thread_pool_context);
+       sgen_thread_pool_idle_wait (context->thread_pool_context, continue_idle_wait);
        SGEN_ASSERT (0, !sgen_workers_are_working (context), "Can only signal enqueue work when in no work state");
 
        /* At this point all the workers have stopped. */
@@ -457,8 +514,8 @@ sgen_workers_join (int generation)
 }
 
 /*
- * Can only be called if the workers are stopped.
- * If we're stopped, there are also no pending jobs.
+ * Can only be called if the workers are not working in the
+ * context and there are no pending jobs.
  */
 gboolean
 sgen_workers_have_idle_work (int generation)
@@ -466,8 +523,6 @@ sgen_workers_have_idle_work (int generation)
        WorkerContext *context = &worker_contexts [generation];
        int i;
 
-       SGEN_ASSERT (0, context->forced_stop && !sgen_workers_are_working (context), "Checking for idle work should only happen if the workers are stopped.");
-
        if (!sgen_section_gray_queue_is_empty (&context->workers_distribute_gray_queue))
                return TRUE;
 
index 42dc16ac4aef7c6eb18cd5fe0b73dd5f49b6497e..e3147f051af78a87304ce3ffb150f0daeb7f2b74 100644 (file)
@@ -32,6 +32,15 @@ struct _WorkerData {
         */
        gpointer free_block_lists;
        WorkerContext *context;
+
+       /* Work time distribution. Measured in ticks. */
+       gint64 major_scan_time, los_scan_time, total_time;
+       /*
+        * When changing the state of the worker from not working to work enqueued
+        * we set the timestamp so we can compute for how long the worker did actual
+        * work during the phase
+        */
+       gint64 last_start;
 };
 
 struct _WorkerContext {