#define ARRAY_OBJ_INDEX(ptr,array,elem_size) (((char*)(ptr) - ((char*)(array) + G_STRUCT_OFFSET (MonoArray, vector))) / (elem_size))
gboolean
-sgen_client_cardtable_scan_object (GCObject *obj, mword block_obj_size, guint8 *cards, ScanCopyContext ctx)
+sgen_client_cardtable_scan_object (GCObject *obj, guint8 *cards, ScanCopyContext ctx)
{
MonoVTable *vt = SGEN_LOAD_VTABLE (obj);
MonoClass *klass = vt->klass;
libmono_profiler_vtune_la_LDFLAGS = $(prof_ldflags)
libmono_profiler_vtune_static_la_SOURCES = mono-profiler-vtune.c
libmono_profiler_vtune_static_la_LDFLAGS = -static
+libmono_profiler_vtune_static_la_CFLAGS = $(VTUNE_CFLAGS)
+libmono_profiler_vtune_static_la_LIBADD = $(VTUNE_LIBS)
endif
mprof_report_SOURCES = mprof-report.c
return index;
}
-/*
- * Removes all NULL pointers from the array. Not thread safe
- */
-void
-sgen_array_list_remove_nulls (SgenArrayList *array)
-{
- guint32 start = 0;
- volatile gpointer *slot;
-
- SGEN_ARRAY_LIST_FOREACH_SLOT (array, slot) {
- if (*slot)
- *sgen_array_list_get_slot (array, start++) = *slot;
- } SGEN_ARRAY_LIST_END_FOREACH_SLOT;
-
- mono_memory_write_barrier ();
- array->next_slot = start;
-}
-
/*
* Does a linear search through the pointer array to find `ptr`. Returns the index if
* found, otherwise (guint32)-1.
return (guint32)-1;
}
+/*
+ * Default slot setter for SgenArrayList: CAS `ptr` into an empty (NULL)
+ * slot. The `data` argument is unused here; it exists only to match the
+ * generic slot-setter signature. Returns TRUE iff this caller installed
+ * the value (i.e. won the race against concurrent setters).
+ */
+gboolean
+sgen_array_list_default_cas_setter (volatile gpointer *slot, gpointer ptr, int data)
+{
+	if (InterlockedCompareExchangePointer (slot, ptr, NULL) == NULL)
+		return TRUE;
+	return FALSE;
+}
+
+/* Default occupancy check: a slot counts as set when it is non-NULL. */
+gboolean
+sgen_array_list_default_is_slot_set (volatile gpointer *slot)
+{
+	return *slot != NULL;
+}
+
#endif
guint32 sgen_array_list_alloc_block (SgenArrayList *array, guint32 slots_to_add);
guint32 sgen_array_list_add (SgenArrayList *array, gpointer ptr, int data, gboolean increase_size_before_set);
guint32 sgen_array_list_find (SgenArrayList *array, gpointer ptr);
-void sgen_array_list_remove_nulls (SgenArrayList *array);
+gboolean sgen_array_list_default_cas_setter (volatile gpointer *slot, gpointer ptr, int data);
+gboolean sgen_array_list_default_is_slot_set (volatile gpointer *slot);
+
#endif
sgen_card_table_clear_cards ();
#endif
SGEN_TV_GETTIME (atv);
- sgen_get_major_collector ()->scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx);
+ sgen_get_major_collector ()->scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, 0, 1);
SGEN_TV_GETTIME (btv);
last_major_scan_time = SGEN_TV_ELAPSED (atv, btv);
major_card_scan_time += last_major_scan_time;
- sgen_los_scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx);
+ sgen_los_scan_card_table (CARDTABLE_SCAN_GLOBAL, ctx, 0, 1);
SGEN_TV_GETTIME (atv);
last_los_scan_time = SGEN_TV_ELAPSED (btv, atv);
los_card_scan_time += last_los_scan_time;
{
HEAVY_STAT (++large_objects);
- if (sgen_client_cardtable_scan_object (obj, block_obj_size, cards, ctx))
+ if (sgen_client_cardtable_scan_object (obj, cards, ctx))
return;
HEAVY_STAT (++bloby_objects);
* parts of the object based on which cards are marked, do so and return TRUE. Otherwise,
* return FALSE.
*/
-gboolean sgen_client_cardtable_scan_object (GCObject *obj, mword block_obj_size, guint8 *cards, ScanCopyContext ctx);
+gboolean sgen_client_cardtable_scan_object (GCObject *obj, guint8 *cards, ScanCopyContext ctx);
/*
* Called after nursery objects have been pinned. No action is necessary.
extern guint64 stat_slots_allocated_in_vain;
/*
- * Copies an object and enqueues it if a queue is given.
- *
* This function can be used even if the vtable of obj is not valid
* anymore, which is the case in the parallel collector.
*/
static MONO_ALWAYS_INLINE void
-par_copy_object_no_checks (char *destination, GCVTable vt, void *obj, mword objsize, SgenGrayQueue *queue)
+par_copy_object_no_checks (char *destination, GCVTable vt, void *obj, mword objsize)
{
sgen_client_pre_copy_checks (destination, vt, obj, objsize);
binary_protocol_copy (obj, destination, vt, objsize);
SGEN_ASSERT (9, sgen_vtable_get_descriptor (vt), "vtable %p has no gc descriptor", vt);
sgen_client_update_copied_object (destination, vt, obj, objsize);
- obj = destination;
- if (queue) {
- SGEN_LOG (9, "Enqueuing gray object %p (%s)", obj, sgen_client_vtable_get_name (vt));
- GRAY_OBJECT_ENQUEUE (queue, (GCObject *)obj, sgen_vtable_get_descriptor (vt));
- }
}
/*
+ * Copies an object and enqueues it if a queue is given.
* This can return OBJ itself on OOM.
*/
static MONO_NEVER_INLINE GCObject *
GCVTable vt = SGEN_LOAD_VTABLE_UNCHECKED (obj);
gboolean has_references = SGEN_VTABLE_HAS_REFERENCES (vt);
mword objsize = SGEN_ALIGN_UP (sgen_client_par_object_get_size (vt, obj));
- /* FIXME: Does this not mark the newly allocated object? */
void *destination = COLLECTOR_SERIAL_ALLOC_FOR_PROMOTION (vt, obj, objsize, has_references);
if (G_UNLIKELY (!destination)) {
return obj;
}
- if (!has_references)
- queue = NULL;
-
- par_copy_object_no_checks ((char *)destination, vt, obj, objsize, queue);
- /* FIXME: mark mod union cards if necessary */
+ par_copy_object_no_checks ((char *)destination, vt, obj, objsize);
/* set the forwarding pointer */
SGEN_FORWARD_OBJECT (obj, destination);
+ if (has_references) {
+ SGEN_LOG (9, "Enqueuing gray object %p (%s)", destination, sgen_client_vtable_get_name (vt));
+ GRAY_OBJECT_ENQUEUE_SERIAL (queue, (GCObject *)destination, sgen_vtable_get_descriptor (vt));
+ }
+
return (GCObject *)destination;
}
+#if defined(COPY_OR_MARK_PARALLEL)
+/*
+ * Parallel variant of object copying. Multiple GC workers may race to copy
+ * the same object; the forwarding CAS in SGEN_FORWARD_OBJECT_PAR decides the
+ * winner, and only the winning worker enqueues the copy for scanning.
+ * Returns the canonical forwarded copy of `obj`.
+ */
+static MONO_NEVER_INLINE GCObject *
+copy_object_no_checks_par (GCObject *obj, SgenGrayQueue *queue)
+{
+	mword vtable_word = *(mword*)obj;
+	GCObject *destination;
+
+	/* Fast path: another worker already forwarded this object. */
+	destination = (GCObject*) SGEN_VTABLE_IS_FORWARDED (vtable_word);
+
+	if (!destination) {
+		GCVTable vt = (GCVTable) vtable_word;
+		GCObject *final_destination;
+		/*
+		 * At this point we know vt is not tagged and we shouldn't access the vtable through obj
+		 * since it could get copied at any time by another thread.
+		 */
+		gboolean has_references = SGEN_VTABLE_HAS_REFERENCES (vt);
+		mword objsize = SGEN_ALIGN_UP (sgen_client_par_object_get_size (vt, obj));
+		destination = major_collector.alloc_object_par (vt, objsize, has_references);
+
+		par_copy_object_no_checks ((char*)destination, vt, obj, objsize);
+
+		/* FIXME we might need a membar here so other threads see the vtable before we forward */
+
+		/* set the forwarding pointer */
+		SGEN_FORWARD_OBJECT_PAR (obj, destination, final_destination);
+
+		if (destination == final_destination) {
+			/* In a racing case, only the worker that allocated the object enqueues it */
+			if (has_references) {
+				SGEN_LOG (9, "Enqueuing gray object %p (%s)", destination, sgen_client_vtable_get_name (vt));
+				GRAY_OBJECT_ENQUEUE_PARALLEL (queue, (GCObject *)destination, sgen_vtable_get_descriptor (vt));
+			}
+		} else {
+			/* We lost the forwarding race; use the winner's copy instead. */
+			destination = final_destination;
+		}
+	}
+
+	return destination;
+}
+#endif
+
+
#undef COLLECTOR_SERIAL_ALLOC_FOR_PROMOTION
#undef collector_pin_object
+#undef COPY_OR_MARK_PARALLEL
}
static void
-gray_queue_enable_redirect (SgenGrayQueue *queue)
+gray_queue_redirect (SgenGrayQueue *queue)
{
SGEN_ASSERT (0, concurrent_collection_in_progress, "Where are we redirecting the gray queue to, without a concurrent collection?");
- sgen_gray_queue_set_alloc_prepare (queue, sgen_workers_take_from_queue_and_awake);
- sgen_workers_take_from_queue_and_awake (queue);
+ sgen_workers_take_from_queue (queue);
}
void
gboolean
sgen_drain_gray_stack (ScanCopyContext ctx)
{
- ScanObjectFunc scan_func = ctx.ops->scan_object;
- SgenGrayQueue *queue = ctx.queue;
+ SGEN_ASSERT (0, ctx.ops->drain_gray_stack, "Why do we have a scan/copy context with a missing drain gray stack function?");
- if (ctx.ops->drain_gray_stack)
- return ctx.ops->drain_gray_stack (queue);
-
- for (;;) {
- GCObject *obj;
- SgenDescriptor desc;
- GRAY_OBJECT_DEQUEUE (queue, &obj, &desc);
- if (!obj)
- return TRUE;
- SGEN_LOG (9, "Precise gray object scan %p (%s)", obj, sgen_client_vtable_get_name (SGEN_LOAD_VTABLE (obj)));
- scan_func (obj, desc, queue);
- }
- return FALSE;
+ return ctx.ops->drain_gray_stack (ctx.queue);
}
/*
safe_object_get_size (obj_to_pin));
pin_object (obj_to_pin);
- GRAY_OBJECT_ENQUEUE (queue, obj_to_pin, desc);
+ GRAY_OBJECT_ENQUEUE_SERIAL (queue, obj_to_pin, desc);
sgen_pin_stats_register_object (obj_to_pin, GENERATION_NURSERY);
definitely_pinned [count] = obj_to_pin;
count++;
++objects_pinned;
sgen_pin_stats_register_object (object, GENERATION_NURSERY);
- GRAY_OBJECT_ENQUEUE (queue, object, sgen_obj_get_descriptor_safe (object));
+ GRAY_OBJECT_ENQUEUE_SERIAL (queue, object, sgen_obj_get_descriptor_safe (object));
}
/* Sort the addresses in array in increasing order.
SgenGrayQueue *gc_thread_gray_queue;
} ScanJob;
+typedef struct {
+	/* Must be the first field: jobs are allocated as ParallelScanJob and
+	 * handed to the thread pool as ScanJob via a cast. */
+	ScanJob scan_job;
+	/* Which slice of the split card-table scan this job handles
+	 * (0 .. sgen_workers_get_job_split_count () - 1). */
+	int job_index;
+} ParallelScanJob;
+
static ScanCopyContext
scan_copy_context_for_scan_job (void *worker_data_untyped, ScanJob *job)
{
static void
job_scan_major_mod_union_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
{
- ScanJob *job_data = (ScanJob*)job;
- ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, job_data);
+ ParallelScanJob *job_data = (ParallelScanJob*)job;
+ ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
g_assert (concurrent_collection_in_progress);
- major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx);
+ major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
}
static void
job_scan_los_mod_union_card_table (void *worker_data_untyped, SgenThreadPoolJob *job)
{
- ScanJob *job_data = (ScanJob*)job;
- ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, job_data);
+ ParallelScanJob *job_data = (ParallelScanJob*)job;
+ ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
+
+ g_assert (concurrent_collection_in_progress);
+ sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+}
+
+/*
+ * Worker job: preclean slice `job_index` (of the job split count) of the
+ * major heap's mod union card table during the concurrent phase.
+ */
+static void
+job_major_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
+{
+	ParallelScanJob *job_data = (ParallelScanJob*)job;
+	ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
+
+	g_assert (concurrent_collection_in_progress);
+
+	major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
+}
+
+static void
+job_los_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
+{
+ ParallelScanJob *job_data = (ParallelScanJob*)job;
+ ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, (ScanJob*)job_data);
g_assert (concurrent_collection_in_progress);
- sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION, ctx);
+
+ sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx, job_data->job_index, sgen_workers_get_job_split_count ());
}
static void
-job_mod_union_preclean (void *worker_data_untyped, SgenThreadPoolJob *job)
+job_scan_last_pinned (void *worker_data_untyped, SgenThreadPoolJob *job)
{
ScanJob *job_data = (ScanJob*)job;
ScanCopyContext ctx = scan_copy_context_for_scan_job (worker_data_untyped, job_data);
g_assert (concurrent_collection_in_progress);
- major_collector.scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx);
- sgen_los_scan_card_table (CARDTABLE_SCAN_MOD_UNION_PRECLEAN, ctx);
-
sgen_scan_pin_queue_objects (ctx);
}
+/*
+ * Callback run by the worker infrastructure: enqueues the parallel
+ * mod-union preclean jobs (major heap and LOS, one job per slice of the
+ * split count) plus a job that scans the last pinned objects.
+ */
+static void
+workers_finish_callback (void)
+{
+	ParallelScanJob *psj;
+	ScanJob *sj;
+	int split_count = sgen_workers_get_job_split_count ();
+	int i;
+	/* Mod union preclean jobs */
+	for (i = 0; i < split_count; i++) {
+		psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("preclean major mod union cardtable", job_major_mod_union_preclean, sizeof (ParallelScanJob));
+		psj->scan_job.ops = sgen_workers_get_idle_func_object_ops ();
+		psj->scan_job.gc_thread_gray_queue = NULL;
+		psj->job_index = i;
+		sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+	}
+
+	/* LOS mod union preclean jobs, one per slice, same as above. */
+	for (i = 0; i < split_count; i++) {
+		psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("preclean los mod union cardtable", job_los_mod_union_preclean, sizeof (ParallelScanJob));
+		psj->scan_job.ops = sgen_workers_get_idle_func_object_ops ();
+		psj->scan_job.gc_thread_gray_queue = NULL;
+		psj->job_index = i;
+		sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+	}
+
+	/* Finally scan objects pinned since the last pin-queue scan. */
+	sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan last pinned", job_scan_last_pinned, sizeof (ScanJob));
+	sj->ops = sgen_workers_get_idle_func_object_ops ();
+	sj->gc_thread_gray_queue = NULL;
+	sgen_workers_enqueue_job (&sj->job, TRUE);
+}
+
+
static void
init_gray_queue (SgenGrayQueue *gc_thread_gray_queue, gboolean use_workers)
{
} CopyOrMarkFromRootsMode;
static void
-major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_next_pin_slot, CopyOrMarkFromRootsMode mode, SgenObjectOperations *object_ops)
+major_copy_or_mark_from_roots (SgenGrayQueue *gc_thread_gray_queue, size_t *old_next_pin_slot, CopyOrMarkFromRootsMode mode, SgenObjectOperations *object_ops_nopar, SgenObjectOperations *object_ops_par)
{
LOSObject *bigobj;
TV_DECLARE (atv);
*/
char *heap_start = NULL;
char *heap_end = (char*)-1;
- ScanCopyContext ctx = CONTEXT_FROM_OBJECT_OPERATIONS (object_ops, gc_thread_gray_queue);
+ ScanCopyContext ctx = CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, gc_thread_gray_queue);
gboolean concurrent = mode != COPY_OR_MARK_FROM_ROOTS_SERIAL;
SGEN_ASSERT (0, !!concurrent == !!concurrent_collection_in_progress, "We've been called with the wrong mode.");
}
sgen_los_pin_object (bigobj->data);
if (SGEN_OBJECT_HAS_REFERENCES (bigobj->data))
- GRAY_OBJECT_ENQUEUE (gc_thread_gray_queue, bigobj->data, sgen_obj_get_descriptor ((GCObject*)bigobj->data));
+ GRAY_OBJECT_ENQUEUE_SERIAL (gc_thread_gray_queue, bigobj->data, sgen_obj_get_descriptor ((GCObject*)bigobj->data));
sgen_pin_stats_register_object (bigobj->data, GENERATION_OLD);
SGEN_LOG (6, "Marked large object %p (%s) size: %lu from roots", bigobj->data,
sgen_client_vtable_get_name (SGEN_LOAD_VTABLE (bigobj->data)),
SGEN_ASSERT (0, sgen_workers_all_done (), "Why are the workers not done when we start or finish a major collection?");
if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
+ sgen_workers_set_num_active_workers (0);
if (sgen_workers_have_idle_work ()) {
/*
* We force the finish of the worker with the new object ops context
* which can also do copying. We need to have finished pinning.
*/
- sgen_workers_start_all_workers (object_ops, NULL);
+ sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
+
sgen_workers_join ();
}
}
sgen_client_collecting_major_3 (&fin_ready_queue, &critical_fin_queue);
- enqueue_scan_from_roots_jobs (gc_thread_gray_queue, heap_start, heap_end, object_ops, FALSE);
+ enqueue_scan_from_roots_jobs (gc_thread_gray_queue, heap_start, heap_end, object_ops_nopar, FALSE);
TV_GETTIME (btv);
time_major_scan_roots += TV_ELAPSED (atv, btv);
* the roots.
*/
if (mode == COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT) {
+ sgen_workers_set_num_active_workers (1);
+ gray_queue_redirect (gc_thread_gray_queue);
if (precleaning_enabled) {
- ScanJob *sj;
- /* Mod union preclean job */
- sj = (ScanJob*)sgen_thread_pool_job_alloc ("preclean mod union cardtable", job_mod_union_preclean, sizeof (ScanJob));
- sj->ops = object_ops;
- sj->gc_thread_gray_queue = NULL;
- sgen_workers_start_all_workers (object_ops, &sj->job);
+ sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, workers_finish_callback);
} else {
- sgen_workers_start_all_workers (object_ops, NULL);
+ sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
}
- gray_queue_enable_redirect (gc_thread_gray_queue);
}
if (mode == COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT) {
- ScanJob *sj;
+ int i, split_count = sgen_workers_get_job_split_count ();
+
+ gray_queue_redirect (gc_thread_gray_queue);
/* Mod union card table */
- sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan mod union cardtable", job_scan_major_mod_union_card_table, sizeof (ScanJob));
- sj->ops = object_ops;
- sj->gc_thread_gray_queue = gc_thread_gray_queue;
- sgen_workers_enqueue_job (&sj->job, FALSE);
-
- sj = (ScanJob*)sgen_thread_pool_job_alloc ("scan LOS mod union cardtable", job_scan_los_mod_union_card_table, sizeof (ScanJob));
- sj->ops = object_ops;
- sj->gc_thread_gray_queue = gc_thread_gray_queue;
- sgen_workers_enqueue_job (&sj->job, FALSE);
-
- TV_GETTIME (atv);
- time_major_scan_mod_union += TV_ELAPSED (btv, atv);
+ for (i = 0; i < split_count; i++) {
+ ParallelScanJob *psj;
+
+ psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan mod union cardtable", job_scan_major_mod_union_card_table, sizeof (ParallelScanJob));
+ psj->scan_job.ops = object_ops_par ? object_ops_par : object_ops_nopar;
+ psj->scan_job.gc_thread_gray_queue = NULL;
+ psj->job_index = i;
+ sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+
+ psj = (ParallelScanJob*)sgen_thread_pool_job_alloc ("scan LOS mod union cardtable", job_scan_los_mod_union_card_table, sizeof (ParallelScanJob));
+ psj->scan_job.ops = object_ops_par ? object_ops_par : object_ops_nopar;
+ psj->scan_job.gc_thread_gray_queue = NULL;
+ psj->job_index = i;
+ sgen_workers_enqueue_job (&psj->scan_job.job, TRUE);
+ }
+
+ /*
+ * If we enqueue a job while workers are running we need to sgen_workers_ensure_awake
+ * in order to make sure that we are running the idle func and draining all worker
+ * gray queues. The operation of starting workers implies this, so we start them after
+ * in order to avoid doing this operation twice. The workers will drain the main gray
+ * stack that contained roots and pinned objects and also scan the mod union card
+ * table.
+ */
+ sgen_workers_start_all_workers (object_ops_nopar, object_ops_par, NULL);
+ sgen_workers_join ();
}
sgen_pin_stats_report ();
static void
major_start_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason, gboolean concurrent, size_t *old_next_pin_slot)
{
- SgenObjectOperations *object_ops;
+ SgenObjectOperations *object_ops_nopar, *object_ops_par = NULL;
binary_protocol_collection_begin (gc_stats.major_gc_count, GENERATION_OLD);
g_assert (major_collector.is_concurrent);
concurrent_collection_in_progress = TRUE;
- object_ops = &major_collector.major_ops_concurrent_start;
+ object_ops_nopar = &major_collector.major_ops_concurrent_start;
+ if (major_collector.is_parallel)
+ object_ops_par = &major_collector.major_ops_conc_par_start;
+
} else {
- object_ops = &major_collector.major_ops_serial;
+ object_ops_nopar = &major_collector.major_ops_serial;
}
reset_pinned_from_failed_allocation ();
if (major_collector.start_major_collection)
major_collector.start_major_collection ();
- major_copy_or_mark_from_roots (gc_thread_gray_queue, old_next_pin_slot, concurrent ? COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT : COPY_OR_MARK_FROM_ROOTS_SERIAL, object_ops);
+ major_copy_or_mark_from_roots (gc_thread_gray_queue, old_next_pin_slot, concurrent ? COPY_OR_MARK_FROM_ROOTS_START_CONCURRENT : COPY_OR_MARK_FROM_ROOTS_SERIAL, object_ops_nopar, object_ops_par);
}
static void
major_finish_collection (SgenGrayQueue *gc_thread_gray_queue, const char *reason, gboolean is_overflow, size_t old_next_pin_slot, gboolean forced)
{
ScannedObjectCounts counts;
- SgenObjectOperations *object_ops;
+ SgenObjectOperations *object_ops_nopar;
mword fragment_total;
TV_DECLARE (atv);
TV_DECLARE (btv);
TV_GETTIME (btv);
if (concurrent_collection_in_progress) {
- object_ops = &major_collector.major_ops_concurrent_finish;
+ SgenObjectOperations *object_ops_par = NULL;
- major_copy_or_mark_from_roots (gc_thread_gray_queue, NULL, COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT, object_ops);
+ object_ops_nopar = &major_collector.major_ops_concurrent_finish;
+ if (major_collector.is_parallel)
+ object_ops_par = &major_collector.major_ops_conc_par_finish;
+
+ major_copy_or_mark_from_roots (gc_thread_gray_queue, NULL, COPY_OR_MARK_FROM_ROOTS_FINISH_CONCURRENT, object_ops_nopar, object_ops_par);
#ifdef SGEN_DEBUG_INTERNAL_ALLOC
main_gc_thread = NULL;
#endif
} else {
- object_ops = &major_collector.major_ops_serial;
+ object_ops_nopar = &major_collector.major_ops_serial;
}
sgen_workers_assert_gray_queue_is_empty ();
- finish_gray_stack (GENERATION_OLD, CONTEXT_FROM_OBJECT_OPERATIONS (object_ops, gc_thread_gray_queue));
+ finish_gray_stack (GENERATION_OLD, CONTEXT_FROM_OBJECT_OPERATIONS (object_ops_nopar, gc_thread_gray_queue));
TV_GETTIME (atv);
time_major_finish_gray_stack += TV_ELAPSED (btv, atv);
sgen_marksweep_init (&major_collector);
} else if (!strcmp (major_collector_opt, "marksweep-conc")) {
sgen_marksweep_conc_init (&major_collector);
+ } else if (!strcmp (major_collector_opt, "marksweep-conc-par")) {
+ sgen_marksweep_conc_par_init (&major_collector);
} else {
sgen_env_var_error (MONO_GC_PARAMS_NAME, "Using `" DEFAULT_MAJOR_NAME "` instead.", "Unknown major collector `%s'.", major_collector_opt);
goto use_default_major;
if (major_collector.post_param_init)
major_collector.post_param_init (&major_collector);
- if (major_collector.needs_thread_pool)
- sgen_workers_init (1);
+ if (major_collector.needs_thread_pool) {
+ int num_workers = 1;
+ if (major_collector.is_parallel) {
+ /* FIXME Detect the number of physical cores, instead of logical */
+ num_workers = mono_cpu_count () / 2;
+ if (num_workers < 1)
+ num_workers = 1;
+ }
+ sgen_workers_init (num_workers, (SgenWorkerCallback) major_collector.worker_init_cb);
+ }
sgen_memgov_init (max_heap, soft_limit, debug_print_allowance, allowance_ratio, save_target);
#define SGEN_POINTER_UNTAG_VTABLE(p) SGEN_POINTER_UNTAG_ALL((p))
/* returns NULL if not forwarded, or the forwarded address */
-#define SGEN_VTABLE_IS_FORWARDED(vtable) ((GCVTable *)(SGEN_POINTER_IS_TAGGED_FORWARDED ((vtable)) ? SGEN_POINTER_UNTAG_VTABLE ((vtable)) : NULL))
+#define SGEN_VTABLE_IS_FORWARDED(vtable) ((GCObject *)(SGEN_POINTER_IS_TAGGED_FORWARDED ((vtable)) ? SGEN_POINTER_UNTAG_VTABLE ((vtable)) : NULL))
#define SGEN_OBJECT_IS_FORWARDED(obj) ((GCObject *)SGEN_VTABLE_IS_FORWARDED (((mword*)(obj))[0]))
#define SGEN_VTABLE_IS_PINNED(vtable) SGEN_POINTER_IS_TAGGED_PINNED ((vtable))
#define SGEN_FORWARD_OBJECT(obj,fw_addr) do { \
*(void**)(obj) = SGEN_POINTER_TAG_FORWARDED ((fw_addr)); \
} while (0)
+/*
+ * Parallel-safe forwarding: atomically installs `fw_addr` as the forwarding
+ * pointer of `obj` via CAS. If another thread forwarded the object first
+ * (either before we read the vtable word, or by winning the CAS),
+ * `final_fw_addr` is set to the winner's destination; otherwise it is set
+ * to `fw_addr`. Callers must treat `final_fw_addr` as the canonical copy.
+ * NOTE(review): the early `break` relies on do/while (0) expansion -- do
+ * not wrap uses in another loop construct.
+ */
+#define SGEN_FORWARD_OBJECT_PAR(obj,fw_addr,final_fw_addr) do {	\
+	gpointer old_vtable_word = *(gpointer*)obj;	\
+	gpointer new_vtable_word;	\
+	final_fw_addr = SGEN_VTABLE_IS_FORWARDED (old_vtable_word);	\
+	if (final_fw_addr)	\
+		break;	\
+	new_vtable_word = SGEN_POINTER_TAG_FORWARDED ((fw_addr));	\
+	old_vtable_word = InterlockedCompareExchangePointer ((gpointer*)obj, new_vtable_word, old_vtable_word);	\
+	final_fw_addr = SGEN_VTABLE_IS_FORWARDED (old_vtable_word);	\
+	if (!final_fw_addr)	\
+		final_fw_addr = (fw_addr);	\
+	} while (0)
#define SGEN_PIN_OBJECT(obj) do { \
*(void**)(obj) = SGEN_POINTER_TAG_PINNED (*(void**)(obj)); \
} while (0)
struct _SgenMajorCollector {
size_t section_size;
gboolean is_concurrent;
+ gboolean is_parallel;
gboolean needs_thread_pool;
gboolean supports_cardtable;
gboolean sweeps_lazily;
SgenObjectOperations major_ops_serial;
SgenObjectOperations major_ops_concurrent_start;
SgenObjectOperations major_ops_concurrent_finish;
+ SgenObjectOperations major_ops_conc_par_start;
+ SgenObjectOperations major_ops_conc_par_finish;
GCObject* (*alloc_object) (GCVTable vtable, size_t size, gboolean has_references);
+ GCObject* (*alloc_object_par) (GCVTable vtable, size_t size, gboolean has_references);
void (*free_pinned_object) (GCObject *obj, size_t size);
/*
void (*free_non_pinned_object) (GCObject *obj, size_t size);
void (*pin_objects) (SgenGrayQueue *queue);
void (*pin_major_object) (GCObject *obj, SgenGrayQueue *queue);
- void (*scan_card_table) (CardTableScanType scan_type, ScanCopyContext ctx);
+ void (*scan_card_table) (CardTableScanType scan_type, ScanCopyContext ctx, int job_index, int job_split_count);
void (*iterate_live_block_ranges) (sgen_cardtable_block_callback callback);
void (*iterate_block_ranges) (sgen_cardtable_block_callback callback);
void (*update_cardtable_mod_union) (void);
guint8* (*get_cardtable_mod_union_for_reference) (char *object);
long long (*get_and_reset_num_major_objects_marked) (void);
void (*count_cards) (long long *num_total_cards, long long *num_marked_cards);
+
+ void (*worker_init_cb) (gpointer worker);
};
extern SgenMajorCollector major_collector;
void sgen_marksweep_init (SgenMajorCollector *collector);
-void sgen_marksweep_fixed_init (SgenMajorCollector *collector);
-void sgen_marksweep_par_init (SgenMajorCollector *collector);
-void sgen_marksweep_fixed_par_init (SgenMajorCollector *collector);
void sgen_marksweep_conc_init (SgenMajorCollector *collector);
+void sgen_marksweep_conc_par_init (SgenMajorCollector *collector);
SgenMajorCollector* sgen_get_major_collector (void);
static inline mword
sgen_safe_object_get_size (GCObject *obj)
{
- GCObject *forwarded;
-
- if ((forwarded = SGEN_OBJECT_IS_FORWARDED (obj)))
- obj = forwarded;
+ GCObject *forwarded;
+ GCVTable vtable = SGEN_LOAD_VTABLE_UNCHECKED (obj);
- return sgen_client_par_object_get_size (SGEN_LOAD_VTABLE (obj), obj);
+	/*
+	 * Once we load the vtable, we must always use it, in case we are in the parallel
+	 * case. Otherwise the object might get forwarded in the meantime and we would
+	 * read an invalid vtable. An object cannot be forwarded a second time during
+	 * the same GC.
+	 */
+ if ((forwarded = SGEN_VTABLE_IS_FORWARDED (vtable)))
+ return sgen_client_par_object_get_size (SGEN_LOAD_VTABLE (forwarded), obj);
+ else
+ return sgen_client_par_object_get_size ((GCVTable)SGEN_POINTER_UNTAG_ALL (vtable), obj);
}
static inline gboolean
gboolean sgen_ptr_is_in_los (char *ptr, char **start);
void sgen_los_iterate_objects (IterateObjectCallbackFunc cb, void *user_data);
void sgen_los_iterate_live_block_ranges (sgen_cardtable_block_callback callback);
-void sgen_los_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx);
+void sgen_los_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx, int job_index, int job_split_count);
void sgen_los_update_cardtable_mod_union (void);
void sgen_los_count_cards (long long *num_total_cards, long long *num_marked_cards);
gboolean sgen_los_is_valid_object (char *object);
LOSObject* sgen_los_header_for_object (GCObject *data);
mword sgen_los_object_size (LOSObject *obj);
void sgen_los_pin_object (GCObject *obj);
+gboolean sgen_los_pin_object_par (GCObject *obj);
gboolean sgen_los_object_is_pinned (GCObject *obj);
void sgen_los_mark_mod_union_card (GCObject *mono_obj, void **ptr);
static GrayQueueSection *last_gray_queue_free_list;
void
-sgen_gray_object_alloc_queue_section (SgenGrayQueue *queue)
+sgen_gray_object_alloc_queue_section (SgenGrayQueue *queue, gboolean is_parallel)
{
GrayQueueSection *section;
- if (queue->alloc_prepare_func)
- queue->alloc_prepare_func (queue);
-
if (queue->free_list) {
/* Use the previously allocated queue sections if possible */
section = queue->free_list;
STATE_SET (section, GRAY_QUEUE_SECTION_STATE_FLOATING);
}
- section->size = SGEN_GRAY_QUEUE_SECTION_SIZE;
+ /* Section is empty */
+ section->size = 0;
STATE_TRANSITION (section, GRAY_QUEUE_SECTION_STATE_FLOATING, GRAY_QUEUE_SECTION_STATE_ENQUEUED);
/* Link it with the others */
section->next = queue->first;
+ section->prev = NULL;
+ if (queue->first)
+ queue->first->prev = section;
+ else
+ queue->last = section;
queue->first = section;
queue->cursor = section->entries - 1;
+
+ if (is_parallel) {
+ mono_memory_write_barrier ();
+ /*
+ * FIXME
+ * we could probably optimize the code to only rely on the write barrier
+ * for synchronization with the stealer thread. Additionally we could also
+ * do a write barrier once every other gray queue change, and request
+ * to have a minimum of sections before stealing, to keep consistency.
+ */
+ InterlockedIncrement (&queue->num_sections);
+ } else {
+ queue->num_sections++;
+ }
}
void
*/
void
-sgen_gray_object_enqueue (SgenGrayQueue *queue, GCObject *obj, SgenDescriptor desc)
+sgen_gray_object_enqueue (SgenGrayQueue *queue, GCObject *obj, SgenDescriptor desc, gboolean is_parallel)
{
GrayQueueEntry entry = SGEN_GRAY_QUEUE_ENTRY (obj, desc);
if (G_UNLIKELY (!queue->first || queue->cursor == GRAY_LAST_CURSOR_POSITION (queue->first))) {
if (queue->first) {
- /* Set the current section size back to default, might have been changed by sgen_gray_object_dequeue_section */
+			/*
+			 * We don't actively update the section size with each push/pop. For the first
+			 * section we determine the size from the cursor position. For the rest of the
+			 * sections we need to have the size set.
+			 */
queue->first->size = SGEN_GRAY_QUEUE_SECTION_SIZE;
}
- sgen_gray_object_alloc_queue_section (queue);
+ sgen_gray_object_alloc_queue_section (queue, is_parallel);
}
STATE_ASSERT (queue->first, GRAY_QUEUE_SECTION_STATE_ENQUEUED);
SGEN_ASSERT (9, queue->cursor <= GRAY_LAST_CURSOR_POSITION (queue->first), "gray queue %p overflow, first %p, cursor %p", queue, queue->first, queue->cursor);
#endif
}
+/*
+ * We attempt to spread the objects in the gray queue across a number
+ * of sections. If the queue has more sections, then it's already spread,
+ * if it doesn't have enough sections, then we allocate as many as we
+ * can.
+ */
+void
+sgen_gray_object_spread (SgenGrayQueue *queue, int num_sections)
+{
+	GrayQueueSection *section_start, *section_end;
+	int total_entries = 0, num_entries_per_section;
+	int num_sections_final;
+
+	/* Already spread across enough sections; nothing to do. */
+	if (queue->num_sections >= num_sections)
+		return;
+
+	/* Empty queue; nothing to spread. */
+	if (!queue->first)
+		return;
+
+	/* Compute number of elements in the gray queue */
+	queue->first->size = queue->cursor - queue->first->entries + 1;
+	total_entries = queue->first->size;
+	for (section_start = queue->first->next; section_start != NULL; section_start = section_start->next) {
+		SGEN_ASSERT (0, section_start->size == SGEN_GRAY_QUEUE_SECTION_SIZE, "We expect all section aside from the first one to be full");
+		total_entries += section_start->size;
+	}
+
+	/* Compute how many sections we should have and elements per section */
+	num_sections_final = (total_entries > num_sections) ? num_sections : total_entries;
+	num_entries_per_section = total_entries / num_sections_final;
+
+	/* Allocate all needed sections */
+	/* New sections are prepended empty (size 0) at the head of the queue. */
+	while (queue->num_sections < num_sections_final)
+		sgen_gray_object_alloc_queue_section (queue, TRUE);
+
+	/* Spread out the elements in the sections. By design, sections at the end are fuller. */
+	section_start = queue->first;
+	section_end = queue->last;
+	while (section_start != section_end) {
+		/* We move entries from end to start, until they meet */
+		while (section_start->size < num_entries_per_section) {
+			GrayQueueEntry entry;
+			if (section_end->size <= num_entries_per_section) {
+				/* Tail section drained to target level; move toward the front. */
+				section_end = section_end->prev;
+				if (section_end == section_start)
+					break;
+			}
+			if (section_end->size <= num_entries_per_section)
+				break;
+
+			section_end->size--;
+			entry = section_end->entries [section_end->size];
+			section_start->entries [section_start->size] = entry;
+			section_start->size++;
+		}
+		section_start = section_start->next;
+	}
+
+	/* Cursor points at the last valid entry of the (possibly partial) first section. */
+	queue->cursor = queue->first->entries + queue->first->size - 1;
+	queue->num_sections = num_sections_final;
+}
+
GrayQueueEntry
-sgen_gray_object_dequeue (SgenGrayQueue *queue)
+sgen_gray_object_dequeue (SgenGrayQueue *queue, gboolean is_parallel)
{
GrayQueueEntry entry;
#endif
if (G_UNLIKELY (queue->cursor < GRAY_FIRST_CURSOR_POSITION (queue->first))) {
- GrayQueueSection *section = queue->first;
+ GrayQueueSection *section;
+ gint32 old_num_sections = 0;
+
+ if (is_parallel)
+ old_num_sections = InterlockedDecrement (&queue->num_sections);
+ else
+ queue->num_sections--;
+
+ if (is_parallel && old_num_sections <= 0) {
+ mono_os_mutex_lock (&queue->steal_mutex);
+ }
+
+ section = queue->first;
queue->first = section->next;
+ if (queue->first) {
+ queue->first->prev = NULL;
+ } else {
+ queue->last = NULL;
+ SGEN_ASSERT (0, !old_num_sections, "Why do we have an inconsistent number of sections ?");
+ }
section->next = queue->free_list;
STATE_TRANSITION (section, GRAY_QUEUE_SECTION_STATE_ENQUEUED, GRAY_QUEUE_SECTION_STATE_FREE_LIST);
queue->free_list = section;
queue->cursor = queue->first ? queue->first->entries + queue->first->size - 1 : NULL;
+
+ if (is_parallel && old_num_sections <= 0) {
+ mono_os_mutex_unlock (&queue->steal_mutex);
+ }
}
return entry;
if (!queue->first)
return NULL;
+ /* We never steal from this queue */
+ queue->num_sections--;
+
section = queue->first;
queue->first = section->next;
+ if (queue->first)
+ queue->first->prev = NULL;
+ else
+ queue->last = NULL;
section->next = NULL;
section->size = queue->cursor - section->entries + 1;
return section;
}
+GrayQueueSection*
+sgen_gray_object_steal_section (SgenGrayQueue *queue)
+{
+ gint32 sections_remaining;
+ GrayQueueSection *section = NULL;
+
+ /*
+ * With each section enqueued we increment num_sections; each dequeue or
+ * steal decrements it. There is only one thread accessing the top (the
+ * owner) and potentially multiple workers trying to steal sections from
+ * the bottom, so we need to lock. A num_sections decrement from the owner
+ * means that the first section is reserved, while a decrement by the
+ * stealer means that the last section is reserved. If, after we decrement
+ * num_sections, we have at least one more section present, it means we
+ * can't race with the other thread. If this is not the case, the steal
+ * end abandons the pop, restoring num_sections, while the owner end takes
+ * a lock to make sure it is not racing with the stealer (since the stealer
+ * might have popped an entry and be in the process of updating the entry
+ * that the owner is trying to pop).
+ */
+
+ if (queue->num_sections <= 1)
+ return NULL;
+
+ /* Give up if there is contention on the last section */
+ if (mono_os_mutex_trylock (&queue->steal_mutex) != 0)
+ return NULL;
+
+ sections_remaining = InterlockedDecrement (&queue->num_sections);
+ if (sections_remaining <= 0) {
+ /* The section that we tried to steal might be the head of the queue. */
+ InterlockedIncrement (&queue->num_sections);
+ } else {
+ /* We have reserved for us the tail section of the queue */
+ section = queue->last;
+ SGEN_ASSERT (0, section, "Why we don't have any sections to steal?");
+ SGEN_ASSERT (0, !section->next, "Why aren't we stealing the tail?");
+ queue->last = section->prev;
+ section->prev = NULL;
+ SGEN_ASSERT (0, queue->last, "Why are we stealing the last section?");
+ queue->last->next = NULL;
+
+ STATE_TRANSITION (section, GRAY_QUEUE_SECTION_STATE_ENQUEUED, GRAY_QUEUE_SECTION_STATE_FLOATING);
+ }
+
+ mono_os_mutex_unlock (&queue->steal_mutex);
+ return section;
+}
+
void
-sgen_gray_object_enqueue_section (SgenGrayQueue *queue, GrayQueueSection *section)
+sgen_gray_object_enqueue_section (SgenGrayQueue *queue, GrayQueueSection *section, gboolean is_parallel)
{
STATE_TRANSITION (section, GRAY_QUEUE_SECTION_STATE_FLOATING, GRAY_QUEUE_SECTION_STATE_ENQUEUED);
queue->first->size = queue->cursor - queue->first->entries + 1;
section->next = queue->first;
+ section->prev = NULL;
+ if (queue->first)
+ queue->first->prev = section;
+ else
+ queue->last = section;
queue->first = section;
queue->cursor = queue->first->entries + queue->first->size - 1;
#ifdef SGEN_CHECK_GRAY_OBJECT_ENQUEUE
queue->enqueue_check_func (section->entries [i].obj);
}
#endif
+ if (is_parallel) {
+ mono_memory_write_barrier ();
+ InterlockedIncrement (&queue->num_sections);
+ } else {
+ queue->num_sections++;
+ }
}
void
queue->enqueue_check_func = enqueue_check_func;
#endif
+ mono_os_mutex_init (&queue->steal_mutex);
+
if (reuse_free_list) {
queue->free_list = last_gray_queue_free_list;
last_gray_queue_free_list = NULL;
memset (queue, 0, sizeof (SgenGrayQueue));
}
-void
-sgen_gray_queue_set_alloc_prepare (SgenGrayQueue *queue, GrayQueueAllocPrepareFunc alloc_prepare_func)
-{
- SGEN_ASSERT (0, !queue->alloc_prepare_func, "Can't set gray queue alloc-prepare twice");
- queue->alloc_prepare_func = alloc_prepare_func;
-}
-
void
sgen_gray_object_queue_deinit (SgenGrayQueue *queue)
{
/* SGEN_GRAY_QUEUE_HEADER_SIZE is number of machine words */
#ifdef SGEN_CHECK_GRAY_OBJECT_SECTIONS
-#define SGEN_GRAY_QUEUE_HEADER_SIZE 4
+#define SGEN_GRAY_QUEUE_HEADER_SIZE 5
#else
-#define SGEN_GRAY_QUEUE_HEADER_SIZE 2
+#define SGEN_GRAY_QUEUE_HEADER_SIZE 3
#endif
#define SGEN_GRAY_QUEUE_SECTION_SIZE (128 - SGEN_GRAY_QUEUE_HEADER_SIZE)
#define SGEN_GRAY_QUEUE_ENTRY(obj,desc) { (obj), (desc) }
+#define GRAY_OBJECT_ENQUEUE_SERIAL(queue, obj, desc) (GRAY_OBJECT_ENQUEUE (queue, obj, desc, FALSE))
+#define GRAY_OBJECT_ENQUEUE_PARALLEL(queue, obj, desc) (GRAY_OBJECT_ENQUEUE (queue, obj, desc, TRUE))
+#define GRAY_OBJECT_DEQUEUE_SERIAL(queue, obj, desc) (GRAY_OBJECT_DEQUEUE (queue, obj, desc, FALSE))
+#define GRAY_OBJECT_DEQUEUE_PARALLEL(queue, obj, desc) (GRAY_OBJECT_DEQUEUE (queue, obj, desc, TRUE))
+
/*
* This is a stack now instead of a queue, so the most recently added items are removed
* first, improving cache locality, and keeping the stack size manageable.
GrayQueueSectionState state;
#endif
int size;
- GrayQueueSection *next;
+ GrayQueueSection *next, *prev;
GrayQueueEntry entries [SGEN_GRAY_QUEUE_SECTION_SIZE];
};
struct _SgenGrayQueue {
GrayQueueEntry *cursor;
- GrayQueueSection *first;
+ GrayQueueSection *first, *last;
GrayQueueSection *free_list;
- GrayQueueAllocPrepareFunc alloc_prepare_func;
+ mono_mutex_t steal_mutex;
+ gint32 num_sections;
#ifdef SGEN_CHECK_GRAY_OBJECT_ENQUEUE
GrayQueueEnqueueCheckFunc enqueue_check_func;
#endif
void sgen_init_gray_queues (void);
-void sgen_gray_object_enqueue (SgenGrayQueue *queue, GCObject *obj, SgenDescriptor desc);
-GrayQueueEntry sgen_gray_object_dequeue (SgenGrayQueue *queue);
+void sgen_gray_object_enqueue (SgenGrayQueue *queue, GCObject *obj, SgenDescriptor desc, gboolean is_parallel);
+GrayQueueEntry sgen_gray_object_dequeue (SgenGrayQueue *queue, gboolean is_parallel);
GrayQueueSection* sgen_gray_object_dequeue_section (SgenGrayQueue *queue);
-void sgen_gray_object_enqueue_section (SgenGrayQueue *queue, GrayQueueSection *section);
+GrayQueueSection* sgen_gray_object_steal_section (SgenGrayQueue *queue);
+void sgen_gray_object_spread (SgenGrayQueue *queue, int num_sections);
+void sgen_gray_object_enqueue_section (SgenGrayQueue *queue, GrayQueueSection *section, gboolean is_parallel);
void sgen_gray_object_queue_trim_free_list (SgenGrayQueue *queue);
void sgen_gray_object_queue_init (SgenGrayQueue *queue, GrayQueueEnqueueCheckFunc enqueue_check_func, gboolean reuse_free_list);
void sgen_gray_object_queue_dispose (SgenGrayQueue *queue);
-void sgen_gray_queue_set_alloc_prepare (SgenGrayQueue *queue, GrayQueueAllocPrepareFunc alloc_prepare_func);
void sgen_gray_object_queue_deinit (SgenGrayQueue *queue);
-void sgen_gray_object_alloc_queue_section (SgenGrayQueue *queue);
+void sgen_gray_object_alloc_queue_section (SgenGrayQueue *queue, gboolean is_parallel);
void sgen_gray_object_free_queue_section (GrayQueueSection *section);
void sgen_section_gray_queue_init (SgenSectionGrayQueue *queue, gboolean locked,
}
static inline MONO_ALWAYS_INLINE void
-GRAY_OBJECT_ENQUEUE (SgenGrayQueue *queue, GCObject *obj, SgenDescriptor desc)
+GRAY_OBJECT_ENQUEUE (SgenGrayQueue *queue, GCObject *obj, SgenDescriptor desc, gboolean is_parallel)
{
#if SGEN_MAX_DEBUG_LEVEL >= 9
- sgen_gray_object_enqueue (queue, obj, desc);
+ sgen_gray_object_enqueue (queue, obj, desc, is_parallel);
#else
if (G_UNLIKELY (!queue->first || queue->cursor == GRAY_LAST_CURSOR_POSITION (queue->first))) {
- sgen_gray_object_enqueue (queue, obj, desc);
+ sgen_gray_object_enqueue (queue, obj, desc, is_parallel);
} else {
GrayQueueEntry entry = SGEN_GRAY_QUEUE_ENTRY (obj, desc);
}
static inline MONO_ALWAYS_INLINE void
-GRAY_OBJECT_DEQUEUE (SgenGrayQueue *queue, GCObject** obj, SgenDescriptor *desc)
+GRAY_OBJECT_DEQUEUE (SgenGrayQueue *queue, GCObject** obj, SgenDescriptor *desc, gboolean is_parallel)
{
GrayQueueEntry entry;
#if SGEN_MAX_DEBUG_LEVEL >= 9
- entry = sgen_gray_object_dequeue (queue);
+ entry = sgen_gray_object_dequeue (queue, is_parallel);
*obj = entry.obj;
*desc = entry.desc;
#else
*obj = NULL;
} else if (G_UNLIKELY (queue->cursor == GRAY_FIRST_CURSOR_POSITION (queue->first))) {
- entry = sgen_gray_object_dequeue (queue);
+ entry = sgen_gray_object_dequeue (queue, is_parallel);
*obj = entry.obj;
*desc = entry.desc;
} else {
}
void
-sgen_los_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx)
+sgen_los_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx, int job_index, int job_split_count)
{
LOSObject *obj;
+ int i = 0;
binary_protocol_los_card_table_scan_start (sgen_timestamp (), scan_type & CARDTABLE_SCAN_MOD_UNION);
- for (obj = los_object_list; obj; obj = obj->next) {
+ for (obj = los_object_list; obj; obj = obj->next, i++) {
mword num_cards = 0;
guint8 *cards;
+ if (i % job_split_count != job_index)
+ continue;
+
if (!SGEN_OBJECT_HAS_REFERENCES (obj->data))
continue;
if (!sgen_los_object_is_pinned (obj->data))
continue;
+ if (!obj->cardtable_mod_union)
+ continue;
+
cards = get_cardtable_mod_union_for_object (obj);
g_assert (cards);
if (scan_type == CARDTABLE_SCAN_MOD_UNION_PRECLEAN) {
binary_protocol_pin (data, (gpointer)SGEN_LOAD_VTABLE (data), sgen_safe_object_get_size (data));
}
+gboolean
+sgen_los_pin_object_par (GCObject *data)
+{
+ LOSObject *obj = sgen_los_header_for_object (data);
+ mword old_size = obj->size;
+ if (old_size & 1)
+ return FALSE;
+#if SIZEOF_VOID_P == 4
+ old_size = InterlockedCompareExchange ((volatile gint32*)&obj->size, old_size | 1, old_size);
+#else
+ old_size = InterlockedCompareExchange64 ((volatile gint64*)&obj->size, old_size | 1, old_size);
+#endif
+ if (old_size & 1)
+ return FALSE;
+ binary_protocol_pin (data, (gpointer)SGEN_LOAD_VTABLE (data), sgen_safe_object_get_size (data));
+ return TRUE;
+}
+
static void
sgen_los_unpin_object (GCObject *data)
{
if (sgen_ptr_in_nursery (obj)) {
#if !defined(COPY_OR_MARK_CONCURRENT) && !defined(COPY_OR_MARK_CONCURRENT_WITH_EVACUATION)
int word, bit;
+ gboolean first = TRUE;
GCObject *forwarded, *old_obj;
mword vtable_word = *(mword*)obj;
do_copy_object:
#endif
old_obj = obj;
+#ifdef COPY_OR_MARK_PARALLEL
+ obj = copy_object_no_checks_par (obj, queue);
+#else
obj = copy_object_no_checks (obj, queue);
+#endif
if (G_UNLIKELY (old_obj == obj)) {
/*
* If we fail to evacuate an object we just stop doing it for a
block = MS_BLOCK_FOR_OBJ (obj);
MS_CALC_MARK_BIT (word, bit, obj);
SGEN_ASSERT (9, !MS_MARK_BIT (block, word, bit), "object %p already marked", obj);
+#ifdef COPY_OR_MARK_PARALLEL
+ MS_SET_MARK_BIT_PAR (block, word, bit, first);
+#else
MS_SET_MARK_BIT (block, word, bit);
- binary_protocol_mark (obj, (gpointer)SGEN_LOAD_VTABLE (obj), sgen_safe_object_get_size (obj));
+#endif
+ if (first)
+ binary_protocol_mark (obj, (gpointer)SGEN_LOAD_VTABLE (obj), sgen_safe_object_get_size (obj));
return FALSE;
#endif
}
#endif
+#ifdef COPY_OR_MARK_PARALLEL
+ MS_MARK_OBJECT_AND_ENQUEUE_PAR (obj, desc, block, queue);
+#else
MS_MARK_OBJECT_AND_ENQUEUE (obj, desc, block, queue);
+#endif
} else {
+ gboolean first = TRUE;
HEAVY_STAT (++stat_optimized_copy_major_large);
-
+#ifdef COPY_OR_MARK_PARALLEL
+ first = sgen_los_pin_object_par (obj);
+#else
if (sgen_los_object_is_pinned (obj))
- return FALSE;
- binary_protocol_pin (obj, (gpointer)SGEN_LOAD_VTABLE (obj), sgen_safe_object_get_size (obj));
+ first = FALSE;
+ else
+ sgen_los_pin_object (obj);
+#endif
- sgen_los_pin_object (obj);
- if (SGEN_OBJECT_HAS_REFERENCES (obj))
- GRAY_OBJECT_ENQUEUE (queue, obj, desc);
+ if (first) {
+ binary_protocol_pin (obj, (gpointer)SGEN_LOAD_VTABLE (obj), sgen_safe_object_get_size (obj));
+ if (SGEN_OBJECT_HAS_REFERENCES (obj))
+#ifdef COPY_OR_MARK_PARALLEL
+ GRAY_OBJECT_ENQUEUE_PARALLEL (queue, obj, desc);
+#else
+ GRAY_OBJECT_ENQUEUE_SERIAL (queue, obj, desc);
+#endif
+ }
}
return FALSE;
}
static gboolean
DRAIN_GRAY_STACK_FUNCTION_NAME (SgenGrayQueue *queue)
{
-#if defined(COPY_OR_MARK_CONCURRENT) || defined(COPY_OR_MARK_CONCURRENT_WITH_EVACUATION)
+#if defined(COPY_OR_MARK_CONCURRENT) || defined(COPY_OR_MARK_CONCURRENT_WITH_EVACUATION) || defined(COPY_OR_MARK_PARALLEL)
int i;
for (i = 0; i < 32; i++) {
#else
HEAVY_STAT (++stat_drain_loops);
- GRAY_OBJECT_DEQUEUE (queue, &obj, &desc);
+#if defined(COPY_OR_MARK_PARALLEL)
+ GRAY_OBJECT_DEQUEUE_PARALLEL (queue, &obj, &desc);
+#else
+ GRAY_OBJECT_DEQUEUE_SERIAL (queue, &obj, &desc);
+#endif
if (!obj)
return TRUE;
return FALSE;
}
+#undef COPY_OR_MARK_PARALLEL
#undef COPY_OR_MARK_FUNCTION_NAME
#undef COPY_OR_MARK_WITH_EVACUATION
#undef COPY_OR_MARK_CONCURRENT
#define MS_BLOCK_FREE (MS_BLOCK_SIZE - MS_BLOCK_SKIP)
-#define MS_NUM_MARK_WORDS ((MS_BLOCK_SIZE / SGEN_ALLOC_ALIGN + sizeof (mword) * 8 - 1) / (sizeof (mword) * 8))
+#define MS_NUM_MARK_WORDS (MS_BLOCK_SIZE / SGEN_ALLOC_ALIGN + sizeof (guint32) * 8 - 1) / (sizeof (guint32) * 8)
/*
* Blocks progress from one state to the next:
void ** volatile free_list;
MSBlockInfo * volatile next_free;
guint8 * volatile cardtable_mod_union;
- mword mark_words [MS_NUM_MARK_WORDS];
+ guint32 mark_words [MS_NUM_MARK_WORDS];
};
#define MS_BLOCK_FOR_BLOCK_INFO(b) ((char*)(b))
//casting to int is fine since blocks are 32k
#define MS_CALC_MARK_BIT(w,b,o) do { \
int i = ((int)((char*)(o) - MS_BLOCK_DATA_FOR_OBJ ((o)))) >> SGEN_ALLOC_ALIGN_BITS; \
- if (sizeof (mword) == 4) { \
- (w) = i >> 5; \
- (b) = i & 31; \
- } else { \
- (w) = i >> 6; \
- (b) = i & 63; \
- } \
+ (w) = i >> 5; \
+ (b) = i & 31; \
} while (0)
#define MS_MARK_BIT(bl,w,b) ((bl)->mark_words [(w)] & (ONE_P << (b)))
#define MS_SET_MARK_BIT(bl,w,b) ((bl)->mark_words [(w)] |= (ONE_P << (b)))
+#define MS_SET_MARK_BIT_PAR(bl,w,b,first) do { \
+ guint32 tmp_mark_word = (bl)->mark_words [(w)]; \
+ guint32 old_mark_word; \
+ first = FALSE; \
+ while (!(tmp_mark_word & (ONE_P << (b)))) { \
+ old_mark_word = tmp_mark_word; \
+ tmp_mark_word = InterlockedCompareExchange ((volatile gint32*)&(bl)->mark_words [w], old_mark_word | (ONE_P << (b)), old_mark_word); \
+ if (tmp_mark_word == old_mark_word) { \
+ first = TRUE; \
+ break; \
+ } \
+ } \
+ } while (0)
+
#define MS_OBJ_ALLOCED(o,b) (*(void**)(o) && (*(char**)(o) < MS_BLOCK_FOR_BLOCK_INFO (b) || *(char**)(o) >= MS_BLOCK_FOR_BLOCK_INFO (b) + MS_BLOCK_SIZE))
#define BLOCK_TAG(bl) ((bl)->has_references ? BLOCK_TAG_HAS_REFERENCES ((bl)) : (bl))
/* all allocated blocks in the system */
-static SgenArrayList allocated_blocks = SGEN_ARRAY_LIST_INIT (NULL, NULL, NULL, INTERNAL_MEM_PIN_QUEUE);
+static SgenArrayList allocated_blocks = SGEN_ARRAY_LIST_INIT (NULL, sgen_array_list_default_is_slot_set, sgen_array_list_default_cas_setter, INTERNAL_MEM_PIN_QUEUE);
/* non-allocated block free-list */
static void *empty_blocks = NULL;
* thread only ever adds blocks to the free list, so the ABA problem can't occur.
*/
static MSBlockInfo * volatile *free_block_lists [MS_BLOCK_TYPE_MAX];
+static MonoNativeTlsKey worker_block_free_list_key;
static guint64 stat_major_blocks_alloced = 0;
static guint64 stat_major_blocks_freed = 0;
#define FREE_BLOCKS_FROM(lists,p,r) (lists [((p) ? MS_BLOCK_FLAG_PINNED : 0) | ((r) ? MS_BLOCK_FLAG_REFS : 0)])
#define FREE_BLOCKS(p,r) (FREE_BLOCKS_FROM (free_block_lists, (p), (r)))
+#define FREE_BLOCKS_LOCAL(p,r) (FREE_BLOCKS_FROM (((MSBlockInfo***)mono_native_tls_get_value (worker_block_free_list_key)), (p), (r)))
#define MS_BLOCK_OBJ_SIZE_INDEX(s) \
(((s)+7)>>3 < MS_NUM_FAST_BLOCK_OBJ_SIZE_INDEXES ? \
return alloc_obj (vtable, size, FALSE, has_references);
}
+/*
+ * This can only be called by sgen workers. While this is called we assume
+ * that no other thread is accessing the block free lists. The world should
+ * be stopped and the gc thread should be waiting for workers to finish.
+ */
+static GCObject*
+major_alloc_object_par (GCVTable vtable, size_t size, gboolean has_references)
+{
+ int size_index = MS_BLOCK_OBJ_SIZE_INDEX (size);
+ MSBlockInfo * volatile * free_blocks = FREE_BLOCKS (FALSE, has_references);
+ MSBlockInfo **free_blocks_local = FREE_BLOCKS_LOCAL (FALSE, has_references);
+ void *obj;
+
+ if (free_blocks_local [size_index]) {
+get_slot:
+ obj = unlink_slot_from_free_list_uncontested (free_blocks_local, size_index);
+ } else {
+ MSBlockInfo *block;
+get_block:
+ block = free_blocks [size_index];
+ if (!block) {
+ if (G_UNLIKELY (!ms_alloc_block (size_index, FALSE, has_references)))
+ return NULL;
+ goto get_block;
+ } else {
+ MSBlockInfo *next_free = block->next_free;
+ /*
+ * Once a block is removed from the main list, it cannot return to the list until
+ * all the workers are finished and sweep is starting. This means we don't need
+ * to account for ABA problems.
+ */
+ if (SGEN_CAS_PTR ((volatile gpointer *)&free_blocks [size_index], next_free, block) != block)
+ goto get_block;
+ g_assert (block->free_list);
+ block->next_free = free_blocks_local [size_index];
+ free_blocks_local [size_index] = block;
+
+ goto get_slot;
+ }
+ }
+
+ /* FIXME: assumes object layout */
+ *(GCVTable*)obj = vtable;
+
+ /* FIXME is it worth CAS-ing here */
+ total_allocated_major += block_obj_sizes [size_index];
+
+ return (GCObject *)obj;
+}
+
/*
* We're not freeing the block if it's empty. We leave that work for
* the next major collection.
if (!MS_MARK_BIT ((block), __word, __bit)) { \
MS_SET_MARK_BIT ((block), __word, __bit); \
if (sgen_gc_descr_has_references (desc)) \
- GRAY_OBJECT_ENQUEUE ((queue), (obj), (desc)); \
+ GRAY_OBJECT_ENQUEUE_SERIAL ((queue), (obj), (desc)); \
+ binary_protocol_mark ((obj), (gpointer)SGEN_LOAD_VTABLE ((obj)), sgen_safe_object_get_size ((obj))); \
+ INC_NUM_MAJOR_OBJECTS_MARKED (); \
+ } \
+ } while (0)
+#define MS_MARK_OBJECT_AND_ENQUEUE_PAR(obj,desc,block,queue) do { \
+ int __word, __bit; \
+ gboolean first; \
+ MS_CALC_MARK_BIT (__word, __bit, (obj)); \
+ SGEN_ASSERT (9, MS_OBJ_ALLOCED ((obj), (block)), "object %p not allocated", obj); \
+ MS_SET_MARK_BIT_PAR ((block), __word, __bit, first); \
+ if (first) { \
+ if (sgen_gc_descr_has_references (desc)) \
+ GRAY_OBJECT_ENQUEUE_PARALLEL ((queue), (obj), (desc)); \
binary_protocol_mark ((obj), (gpointer)SGEN_LOAD_VTABLE ((obj)), sgen_safe_object_get_size ((obj))); \
INC_NUM_MAJOR_OBJECTS_MARKED (); \
} \
} while (0)
+
+
static void
pin_major_object (GCObject *obj, SgenGrayQueue *queue)
{
MS_MARK_OBJECT_AND_ENQUEUE (obj, sgen_obj_get_descriptor (obj), block, queue);
}
+#define COPY_OR_MARK_PARALLEL
#include "sgen-major-copy-object.h"
static long long
#define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_no_evacuation
#include "sgen-marksweep-drain-gray-stack.h"
+#define COPY_OR_MARK_PARALLEL
+#define COPY_OR_MARK_FUNCTION_NAME major_copy_or_mark_object_par_no_evacuation
+#define SCAN_OBJECT_FUNCTION_NAME major_scan_object_par_no_evacuation
+#define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_par_no_evacuation
+#include "sgen-marksweep-drain-gray-stack.h"
+
#define COPY_OR_MARK_WITH_EVACUATION
#define COPY_OR_MARK_FUNCTION_NAME major_copy_or_mark_object_with_evacuation
#define SCAN_OBJECT_FUNCTION_NAME major_scan_object_with_evacuation
#define SCAN_PTR_FIELD_FUNCTION_NAME major_scan_ptr_field_with_evacuation
#include "sgen-marksweep-drain-gray-stack.h"
+#define COPY_OR_MARK_PARALLEL
+#define COPY_OR_MARK_WITH_EVACUATION
+#define COPY_OR_MARK_FUNCTION_NAME major_copy_or_mark_object_par_with_evacuation
+#define SCAN_OBJECT_FUNCTION_NAME major_scan_object_par_with_evacuation
+#define SCAN_VTYPE_FUNCTION_NAME major_scan_vtype_par_with_evacuation
+#define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_par_with_evacuation
+#define SCAN_PTR_FIELD_FUNCTION_NAME major_scan_ptr_field_par_with_evacuation
+#include "sgen-marksweep-drain-gray-stack.h"
+
#define COPY_OR_MARK_CONCURRENT
#define COPY_OR_MARK_FUNCTION_NAME major_copy_or_mark_object_concurrent_no_evacuation
#define SCAN_OBJECT_FUNCTION_NAME major_scan_object_concurrent_no_evacuation
#define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_concurrent_no_evacuation
#include "sgen-marksweep-drain-gray-stack.h"
+#define COPY_OR_MARK_PARALLEL
+#define COPY_OR_MARK_CONCURRENT
+#define COPY_OR_MARK_FUNCTION_NAME major_copy_or_mark_object_concurrent_par_no_evacuation
+#define SCAN_OBJECT_FUNCTION_NAME major_scan_object_concurrent_par_no_evacuation
+#define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_concurrent_par_no_evacuation
+#include "sgen-marksweep-drain-gray-stack.h"
+
#define COPY_OR_MARK_CONCURRENT_WITH_EVACUATION
#define COPY_OR_MARK_FUNCTION_NAME major_copy_or_mark_object_concurrent_with_evacuation
#define SCAN_OBJECT_FUNCTION_NAME major_scan_object_concurrent_with_evacuation
#define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_concurrent_with_evacuation
#include "sgen-marksweep-drain-gray-stack.h"
+#define COPY_OR_MARK_PARALLEL
+#define COPY_OR_MARK_CONCURRENT_WITH_EVACUATION
+#define COPY_OR_MARK_FUNCTION_NAME major_copy_or_mark_object_concurrent_par_with_evacuation
+#define SCAN_OBJECT_FUNCTION_NAME major_scan_object_concurrent_par_with_evacuation
+#define SCAN_VTYPE_FUNCTION_NAME major_scan_vtype_concurrent_par_with_evacuation
+#define SCAN_PTR_FIELD_FUNCTION_NAME major_scan_ptr_field_concurrent_par_with_evacuation
+#define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_concurrent_par_with_evacuation
+#include "sgen-marksweep-drain-gray-stack.h"
+
static inline gboolean
major_is_evacuating (void)
{
return drain_gray_stack_no_evacuation (queue);
}
+static gboolean
+drain_gray_stack_par (SgenGrayQueue *queue)
+{
+ if (major_is_evacuating ())
+ return drain_gray_stack_par_with_evacuation (queue);
+ else
+ return drain_gray_stack_par_no_evacuation (queue);
+}
+
static gboolean
drain_gray_stack_concurrent (SgenGrayQueue *queue)
{
return drain_gray_stack_concurrent_no_evacuation (queue);
}
+static gboolean
+drain_gray_stack_concurrent_par (SgenGrayQueue *queue)
+{
+ if (major_is_evacuating ())
+ return drain_gray_stack_concurrent_par_with_evacuation (queue);
+ else
+ return drain_gray_stack_concurrent_par_no_evacuation (queue);
+}
+
static void
major_copy_or_mark_object_canonical (GCObject **ptr, SgenGrayQueue *queue)
{
major_copy_or_mark_object_concurrent_with_evacuation (ptr, *ptr, queue);
}
+static void
+major_copy_or_mark_object_concurrent_par_canonical (GCObject **ptr, SgenGrayQueue *queue)
+{
+ major_copy_or_mark_object_concurrent_par_with_evacuation (ptr, *ptr, queue);
+}
+
static void
major_copy_or_mark_object_concurrent_finish_canonical (GCObject **ptr, SgenGrayQueue *queue)
{
major_copy_or_mark_object_with_evacuation (ptr, *ptr, queue);
}
+static void
+major_copy_or_mark_object_concurrent_par_finish_canonical (GCObject **ptr, SgenGrayQueue *queue)
+{
+ major_copy_or_mark_object_par_with_evacuation (ptr, *ptr, queue);
+}
+
static void
mark_pinned_objects_in_block (MSBlockInfo *block, size_t first_entry, size_t last_entry, SgenGrayQueue *queue)
{
}
/* reset mark bits */
- memset (block->mark_words, 0, sizeof (mword) * MS_NUM_MARK_WORDS);
+ memset (block->mark_words, 0, sizeof (guint32) * MS_NUM_MARK_WORDS);
/* Reverse free list so that it's in address order */
reversed = NULL;
static volatile size_t num_major_sections_before_sweep;
static volatile size_t num_major_sections_freed_in_sweep;
+static void
+sgen_worker_clear_free_block_lists (WorkerData *worker)
+{
+ int i, j;
+
+ if (!worker->free_block_lists)
+ return;
+
+ for (i = 0; i < MS_BLOCK_TYPE_MAX; i++) {
+ for (j = 0; j < num_block_obj_sizes; j++) {
+ ((MSBlockInfo***) worker->free_block_lists) [i][j] = NULL;
+ }
+ }
+}
+
static void
sweep_start (void)
{
free_blocks [j] = NULL;
}
- sgen_array_list_remove_nulls (&allocated_blocks);
+ sgen_workers_foreach (sgen_worker_clear_free_block_lists);
}
static void sweep_finish (void);
ms_free_block (block);
SGEN_ATOMIC_ADD_P (num_major_sections, -1);
+ SGEN_ATOMIC_ADD_P (num_major_sections_freed_in_sweep, 1);
tagged_block = NULL;
}
* cooperate with the sweep thread to finish sweeping, and they will traverse from
* low to high, to avoid constantly colliding on the same blocks.
*/
- for (block_index = num_blocks; block_index-- > 0;) {
- /*
- * The block might have been freed by another thread doing some checking
- * work.
- */
- if (!ensure_block_is_checked_for_sweeping (block_index, TRUE, NULL))
- ++num_major_sections_freed_in_sweep;
+ for (block_index = allocated_blocks.next_slot; block_index-- > 0;) {
+ ensure_block_is_checked_for_sweeping (block_index, TRUE, NULL);
}
while (!try_set_sweep_state (SWEEP_STATE_COMPACTING, SWEEP_STATE_SWEEPING)) {
sweep_start ();
- SGEN_ASSERT (0, num_major_sections == allocated_blocks.next_slot, "We don't know how many blocks we have?");
-
num_major_sections_before_sweep = num_major_sections;
num_major_sections_freed_in_sweep = 0;
}
static void
-major_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx)
+major_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx, int job_index, int job_split_count)
{
MSBlockInfo *block;
gboolean has_references, was_sweeping, skip_scan;
binary_protocol_major_card_table_scan_start (sgen_timestamp (), scan_type & CARDTABLE_SCAN_MOD_UNION);
FOREACH_BLOCK_HAS_REFERENCES_NO_LOCK (block, has_references) {
+ if (__index % job_split_count != job_index)
+ continue;
#ifdef PREFETCH_CARDS
- int prefetch_index = __index + 6;
+ int prefetch_index = __index + 6 * job_split_count;
if (prefetch_index < allocated_blocks.next_slot) {
MSBlockInfo *prefetch_block = BLOCK_UNTAG (*sgen_array_list_get_slot (&allocated_blocks, prefetch_index));
PREFETCH_READ (prefetch_block);
collector->needs_thread_pool = concurrent_mark || concurrent_sweep;
}
+/* We are guaranteed to be called by the worker in question */
+static void
+sgen_worker_init_callback (gpointer worker_untyped)
+{
+ int i;
+ WorkerData *worker = (WorkerData*) worker_untyped;
+ MSBlockInfo ***worker_free_blocks = (MSBlockInfo ***) sgen_alloc_internal_dynamic (sizeof (MSBlockInfo**) * MS_BLOCK_TYPE_MAX, INTERNAL_MEM_MS_TABLES, TRUE);
+
+ for (i = 0; i < MS_BLOCK_TYPE_MAX; i++)
+ worker_free_blocks [i] = (MSBlockInfo **) sgen_alloc_internal_dynamic (sizeof (MSBlockInfo*) * num_block_obj_sizes, INTERNAL_MEM_MS_TABLES, TRUE);
+
+ worker->free_block_lists = worker_free_blocks;
+
+ mono_native_tls_set_value (worker_block_free_list_key, worker_free_blocks);
+}
+
static void
-sgen_marksweep_init_internal (SgenMajorCollector *collector, gboolean is_concurrent)
+sgen_marksweep_init_internal (SgenMajorCollector *collector, gboolean is_concurrent, gboolean is_parallel)
{
int i;
concurrent_mark = is_concurrent;
collector->is_concurrent = is_concurrent;
+ collector->is_parallel = is_parallel;
collector->needs_thread_pool = is_concurrent || concurrent_sweep;
collector->get_and_reset_num_major_objects_marked = major_get_and_reset_num_major_objects_marked;
collector->supports_cardtable = TRUE;
collector->alloc_degraded = major_alloc_degraded;
collector->alloc_object = major_alloc_object;
+ collector->alloc_object_par = major_alloc_object_par;
collector->free_pinned_object = free_pinned_object;
collector->iterate_objects = major_iterate_objects;
collector->free_non_pinned_object = major_free_non_pinned_object;
collector->major_ops_concurrent_finish.scan_vtype = major_scan_vtype_with_evacuation;
collector->major_ops_concurrent_finish.scan_ptr_field = major_scan_ptr_field_with_evacuation;
collector->major_ops_concurrent_finish.drain_gray_stack = drain_gray_stack;
+
+ if (is_parallel) {
+ collector->major_ops_conc_par_start.copy_or_mark_object = major_copy_or_mark_object_concurrent_par_canonical;
+ collector->major_ops_conc_par_start.scan_object = major_scan_object_concurrent_par_with_evacuation;
+ collector->major_ops_conc_par_start.scan_vtype = major_scan_vtype_concurrent_par_with_evacuation;
+ collector->major_ops_conc_par_start.scan_ptr_field = major_scan_ptr_field_concurrent_par_with_evacuation;
+ collector->major_ops_conc_par_start.drain_gray_stack = drain_gray_stack_concurrent_par;
+
+ collector->major_ops_conc_par_finish.copy_or_mark_object = major_copy_or_mark_object_concurrent_par_finish_canonical;
+ collector->major_ops_conc_par_finish.scan_object = major_scan_object_par_with_evacuation;
+ collector->major_ops_conc_par_finish.scan_vtype = major_scan_vtype_par_with_evacuation;
+ collector->major_ops_conc_par_finish.scan_ptr_field = major_scan_ptr_field_par_with_evacuation;
+ collector->major_ops_conc_par_finish.drain_gray_stack = drain_gray_stack_par;
+
+ collector->worker_init_cb = sgen_worker_init_callback;
+
+ mono_native_tls_alloc (&worker_block_free_list_key, NULL);
+ }
}
#ifdef HEAVY_STATISTICS
void
sgen_marksweep_init (SgenMajorCollector *collector)
{
- sgen_marksweep_init_internal (collector, FALSE);
+ sgen_marksweep_init_internal (collector, FALSE, FALSE);
}
void
sgen_marksweep_conc_init (SgenMajorCollector *collector)
{
- sgen_marksweep_init_internal (collector, TRUE);
+ sgen_marksweep_init_internal (collector, TRUE, FALSE);
+}
+
+void
+sgen_marksweep_conc_par_init (SgenMajorCollector *collector)
+{
+ sgen_marksweep_init_internal (collector, TRUE, TRUE);
}
#endif
#undef SERIAL_SCAN_OBJECT
#undef SERIAL_SCAN_VTYPE
#undef SERIAL_SCAN_PTR_FIELD
+#undef SERIAL_DRAIN_GRAY_STACK
#if defined(SGEN_SIMPLE_NURSERY)
#define SERIAL_SCAN_OBJECT simple_nursery_serial_with_concurrent_major_scan_object
#define SERIAL_SCAN_VTYPE simple_nursery_serial_with_concurrent_major_scan_vtype
#define SERIAL_SCAN_PTR_FIELD simple_nursery_serial_with_concurrent_major_scan_ptr_field
+#define SERIAL_DRAIN_GRAY_STACK simple_nursery_serial_with_concurrent_major_drain_gray_stack
#else
#define SERIAL_SCAN_OBJECT simple_nursery_serial_scan_object
#define SERIAL_SCAN_VTYPE simple_nursery_serial_scan_vtype
#define SERIAL_SCAN_PTR_FIELD simple_nursery_serial_scan_ptr_field
+#define SERIAL_DRAIN_GRAY_STACK simple_nursery_serial_drain_gray_stack
#endif
#elif defined (SGEN_SPLIT_NURSERY)
#define SERIAL_SCAN_OBJECT split_nursery_serial_with_concurrent_major_scan_object
#define SERIAL_SCAN_VTYPE split_nursery_serial_with_concurrent_major_scan_vtype
#define SERIAL_SCAN_PTR_FIELD split_nursery_serial_with_concurrent_major_scan_ptr_field
+#define SERIAL_DRAIN_GRAY_STACK split_nursery_serial_with_concurrent_major_drain_gray_stack
#else
#define SERIAL_SCAN_OBJECT split_nursery_serial_scan_object
#define SERIAL_SCAN_VTYPE split_nursery_serial_scan_vtype
#define SERIAL_SCAN_PTR_FIELD split_nursery_serial_scan_ptr_field
+#define SERIAL_DRAIN_GRAY_STACK split_nursery_serial_drain_gray_stack
#endif
#else
HANDLE_PTR (ptr, NULL);
}
+static gboolean
+SERIAL_DRAIN_GRAY_STACK (SgenGrayQueue *queue)
+{
+ for (;;) {
+ GCObject *obj;
+ SgenDescriptor desc;
+
+ GRAY_OBJECT_DEQUEUE_SERIAL (queue, &obj, &desc);
+ if (!obj)
+ return TRUE;
+
+ SERIAL_SCAN_OBJECT (obj, desc, queue);
+ }
+}
+
#define FILL_MINOR_COLLECTOR_SCAN_OBJECT(ops) do { \
(ops)->scan_object = SERIAL_SCAN_OBJECT; \
(ops)->scan_vtype = SERIAL_SCAN_VTYPE; \
(ops)->scan_ptr_field = SERIAL_SCAN_PTR_FIELD; \
+ (ops)->drain_gray_stack = SERIAL_DRAIN_GRAY_STACK; \
} while (0)
if (addr0 < addr1) {
if (unpin_queue)
- GRAY_OBJECT_ENQUEUE (unpin_queue, (GCObject*)addr0, sgen_obj_get_descriptor_safe ((GCObject*)addr0));
+ GRAY_OBJECT_ENQUEUE_SERIAL (unpin_queue, (GCObject*)addr0, sgen_obj_get_descriptor_safe ((GCObject*)addr0));
else
SGEN_UNPIN_OBJECT (addr0);
size = SGEN_ALIGN_UP (sgen_safe_object_get_size ((GCObject*)addr0));
SGEN_ASSERT (5, sgen_ptr_in_nursery (obj), "Can only cement pointers to nursery objects");
if (!hash [i].obj) {
- SGEN_ASSERT (5, !hash [i].count, "Cementing hash inconsistent");
- hash [i].obj = obj;
+ GCObject *old_obj;
+ old_obj = InterlockedCompareExchangePointer ((gpointer*)&hash [i].obj, obj, NULL);
+ /* Check if the slot was occupied by some other object */
+ if (old_obj != NULL && old_obj != obj)
+ return FALSE;
} else if (hash [i].obj != obj) {
return FALSE;
}
if (hash [i].count >= SGEN_CEMENT_THRESHOLD)
return TRUE;
- ++hash [i].count;
- if (hash [i].count == SGEN_CEMENT_THRESHOLD) {
+ if (InterlockedIncrement ((gint32*)&hash [i].count) == SGEN_CEMENT_THRESHOLD) {
SGEN_ASSERT (9, sgen_get_current_collection_generation () >= 0, "We can only cement objects when we're in a collection pause.");
SGEN_ASSERT (9, SGEN_OBJECT_IS_PINNED (obj), "Can only cement pinned objects");
SGEN_CEMENT_OBJECT (obj);
protocol_entry (unsigned char type, gpointer data, int size)
{
int index;
+ gboolean include_worker_index = type != PROTOCOL_ID (binary_protocol_header);
+ int entry_size = size + 1 + (include_worker_index ? 1 : 0); // type + worker_index + size
BinaryProtocolBuffer *buffer;
if (binary_protocol_file == invalid_file_value)
return;
- if (sgen_thread_pool_is_thread_pool_thread (mono_native_thread_id_get ()))
- type |= 0x80;
-
lock_recursive ();
retry:
buffer = binary_protocol_get_buffer (size + 1);
retry_same_buffer:
index = buffer->index;
- if (index + 1 + size > BINARY_PROTOCOL_BUFFER_SIZE)
+ if (index + entry_size > BINARY_PROTOCOL_BUFFER_SIZE)
goto retry;
- if (InterlockedCompareExchange (&buffer->index, index + 1 + size, index) != index)
+ if (InterlockedCompareExchange (&buffer->index, index + entry_size, index) != index)
goto retry_same_buffer;
/* FIXME: if we're interrupted at this point, we have a buffer
entry that contains random data. */
buffer->buffer [index++] = type;
+ /* We should never change the header format */
+ if (include_worker_index) {
+ /*
+ * If the thread is not a worker thread we insert 0, which is interpreted
+ * as gc thread. Worker indexes are 1 based.
+ */
+ buffer->buffer [index++] = (unsigned char) sgen_thread_pool_is_thread_pool_thread (mono_native_thread_id_get ());
+ }
memcpy (buffer->buffer + index, data, size);
index += size;
#include "sgen-gc.h"
#define PROTOCOL_HEADER_CHECK 0xde7ec7ab1ec0de
-#define PROTOCOL_HEADER_VERSION 1
+/*
+ * The version needs to be bumped every time we introduce breaking changes (like
+ * adding new protocol entries or various format changes). The latest protocol grepper
+ * should be able to handle all the previous versions, while an old grepper will
+ * be able to tell if it cannot handle the format.
+ */
+#define PROTOCOL_HEADER_VERSION 2
/* Special indices returned by MATCH_INDEX. */
#define BINARY_PROTOCOL_NO_MATCH (-1)
#include "mono/utils/mono-threads.h"
#endif
+#define MAX_NUM_THREADS 8
+
static mono_mutex_t lock;
static mono_cond_t work_cond;
static mono_cond_t done_cond;
-static MonoNativeThreadId thread;
+static int threads_num = 0;
+static MonoNativeThreadId threads [MAX_NUM_THREADS];
/* Only accessed with the lock held. */
static SgenPointerQueue job_queue;
static SgenThreadPoolThreadInitFunc thread_init_func;
static SgenThreadPoolIdleJobFunc idle_job_func;
static SgenThreadPoolContinueIdleJobFunc continue_idle_job_func;
+static SgenThreadPoolShouldWorkFunc should_work_func;
static volatile gboolean threadpool_shutdown;
-static volatile gboolean thread_finished;
+static volatile int threads_finished = 0;
enum {
STATE_WAITING,
}
+/*
+ * Ask the registered continue-idle callback whether this thread still has
+ * idle work to do. FALSE when no callback is registered.
+ */
static gboolean
-continue_idle_job (void)
+continue_idle_job (void *thread_data)
{
 if (!continue_idle_job_func)
 return FALSE;
- return continue_idle_job_func ();
+ return continue_idle_job_func (thread_data);
+}
+
+/*
+ * Whether this pool thread should participate at all. Defaults to TRUE when
+ * no should-work callback is registered.
+ */
+static gboolean
+should_work (void *thread_data)
+{
+ if (!should_work_func)
+ return TRUE;
+ return should_work_func (thread_data);
+}
static mono_native_thread_return_t
mono_os_mutex_lock (&lock);
for (;;) {
+ gboolean do_idle;
+ SgenThreadPoolJob *job;
+
+ if (!should_work (thread_data)) {
+ mono_os_cond_wait (&work_cond, &lock);
+ continue;
+ }
/*
* It's important that we check the continue idle flag with the lock held.
* Suppose we didn't check with the lock held, and the result is FALSE. The
* main thread might then set continue idle and signal us before we can take
* the lock, and we'd lose the signal.
*/
- gboolean do_idle = continue_idle_job ();
- SgenThreadPoolJob *job = get_job_and_set_in_progress ();
+ do_idle = continue_idle_job (thread_data);
+ job = get_job_and_set_in_progress ();
if (!job && !do_idle && !threadpool_shutdown) {
/*
SGEN_ASSERT (0, idle_job_func, "Why do we have idle work when there's no idle job function?");
do {
idle_job_func (thread_data);
- do_idle = continue_idle_job ();
+ do_idle = continue_idle_job (thread_data);
} while (do_idle && !job_queue.next_slot);
mono_os_mutex_lock (&lock);
} else {
SGEN_ASSERT (0, threadpool_shutdown, "Why did we unlock if no jobs and not shutting down?");
mono_os_mutex_lock (&lock);
- thread_finished = TRUE;
+ threads_finished++;
mono_os_cond_signal (&done_cond);
mono_os_mutex_unlock (&lock);
return 0;
}
+/*
+ * Start the thread pool with up to MAX_NUM_THREADS threads.
+ * NOTE(review): a larger num_threads is silently clamped to MAX_NUM_THREADS,
+ * so callers requesting more get fewer threads than asked for — confirm this
+ * is intended rather than an assertion-worthy condition.
+ */
void
-sgen_thread_pool_init (int num_threads, SgenThreadPoolThreadInitFunc init_func, SgenThreadPoolIdleJobFunc idle_func, SgenThreadPoolContinueIdleJobFunc continue_idle_func, void **thread_datas)
+sgen_thread_pool_init (int num_threads, SgenThreadPoolThreadInitFunc init_func, SgenThreadPoolIdleJobFunc idle_func, SgenThreadPoolContinueIdleJobFunc continue_idle_func, SgenThreadPoolShouldWorkFunc should_work_func_p, void **thread_datas)
{
- SGEN_ASSERT (0, num_threads == 1, "We only support 1 thread pool thread for now.");
+ int i;
+
+ threads_num = (num_threads < MAX_NUM_THREADS) ? num_threads : MAX_NUM_THREADS;
 mono_os_mutex_init (&lock);
 mono_os_cond_init (&work_cond);
 thread_init_func = init_func;
 idle_job_func = idle_func;
 continue_idle_job_func = continue_idle_func;
+ should_work_func = should_work_func_p;
- mono_native_thread_create (&thread, thread_func, thread_datas ? thread_datas [0] : NULL);
+ /* thread_datas, when provided, must have at least threads_num entries. */
+ for (i = 0; i < threads_num; i++)
+ mono_native_thread_create (&threads [i], thread_func, thread_datas ? thread_datas [i] : NULL);
}
void
sgen_thread_pool_shutdown (void)
{
- if (!thread)
+ if (!threads_num)
return;
mono_os_mutex_lock (&lock);
threadpool_shutdown = TRUE;
- mono_os_cond_signal (&work_cond);
- while (!thread_finished)
+ mono_os_cond_broadcast (&work_cond);
+ while (threads_finished < threads_num)
mono_os_cond_wait (&done_cond, &lock);
mono_os_mutex_unlock (&lock);
mono_os_mutex_lock (&lock);
sgen_pointer_queue_add (&job_queue, job);
- /*
- * FIXME: We could check whether there is a job in progress. If there is, there's
- * no need to signal the condition, at least as long as we have only one thread.
- */
mono_os_cond_signal (&work_cond);
mono_os_mutex_unlock (&lock);
mono_os_mutex_lock (&lock);
- if (continue_idle_job_func ())
- mono_os_cond_signal (&work_cond);
+ if (continue_idle_job_func (NULL))
+ mono_os_cond_broadcast (&work_cond);
mono_os_mutex_unlock (&lock);
}
mono_os_mutex_lock (&lock);
- while (continue_idle_job_func ())
+ while (continue_idle_job_func (NULL))
mono_os_cond_wait (&done_cond, &lock);
mono_os_mutex_unlock (&lock);
mono_os_mutex_unlock (&lock);
}
-gboolean
+/* Returns 0 if some_thread is not a pool thread, otherwise its 1-based index in the pool. */
+int
sgen_thread_pool_is_thread_pool_thread (MonoNativeThreadId some_thread)
{
- return some_thread == thread;
+ int i;
+
+ for (i = 0; i < threads_num; i++) {
+ if (some_thread == threads [i])
+ return i + 1;
+ }
+
+ return 0;
}
#endif
typedef void (*SgenThreadPoolThreadInitFunc) (void*);
typedef void (*SgenThreadPoolIdleJobFunc) (void*);
-typedef gboolean (*SgenThreadPoolContinueIdleJobFunc) (void);
+typedef gboolean (*SgenThreadPoolContinueIdleJobFunc) (void*);
+typedef gboolean (*SgenThreadPoolShouldWorkFunc) (void*);
-void sgen_thread_pool_init (int num_threads, SgenThreadPoolThreadInitFunc init_func, SgenThreadPoolIdleJobFunc idle_func, SgenThreadPoolContinueIdleJobFunc continue_idle_func, void **thread_datas);
+void sgen_thread_pool_init (int num_threads, SgenThreadPoolThreadInitFunc init_func, SgenThreadPoolIdleJobFunc idle_func, SgenThreadPoolContinueIdleJobFunc continue_idle_func, SgenThreadPoolShouldWorkFunc should_work_func, void **thread_datas);
void sgen_thread_pool_shutdown (void);
void sgen_thread_pool_wait_for_all_jobs (void);
-gboolean sgen_thread_pool_is_thread_pool_thread (MonoNativeThreadId thread);
+int sgen_thread_pool_is_thread_pool_thread (MonoNativeThreadId thread);
#endif
#include "mono/sgen/sgen-client.h"
static int workers_num;
+static int active_workers_num;
static volatile gboolean forced_stop;
static WorkerData *workers_data;
+static SgenWorkerCallback worker_init_cb;
+
+/*
+ * When using multiple workers, we need to have the last worker
+ * enqueue the preclean jobs (if there are any). This lock ensures
+ * that when the last worker takes it, all the other workers have
+ * gracefully finished, so it can restart them.
+ */
+static mono_mutex_t finished_lock;
+static volatile gboolean workers_finished;
+static int worker_awakenings;
static SgenSectionGrayQueue workers_distribute_gray_queue;
static gboolean workers_distribute_gray_queue_inited;
*
* | from \ to | NOT WORKING | WORKING | WORK ENQUEUED |
* |--------------------+-------------+---------+---------------+
- * | NOT WORKING | - | - | main |
- * | WORKING | worker | - | main |
+ * | NOT WORKING | - | - | main / worker |
+ * | WORKING | worker | - | main / worker |
* | WORK ENQUEUED | - | worker | - |
*
* The WORK ENQUEUED state guarantees that the worker thread will inspect the queue again at
typedef gint32 State;
-static volatile State workers_state;
-
static SgenObjectOperations * volatile idle_func_object_ops;
-static SgenThreadPoolJob * volatile preclean_job;
+static SgenObjectOperations *idle_func_object_ops_par, *idle_func_object_ops_nopar;
+/*
+ * finished_callback is called only when the workers finish work normally (when they
+ * are not forced to finish). The callback is used to enqueue preclean jobs.
+ */
+static volatile SgenWorkersFinishCallback finish_callback;
static guint64 stat_workers_num_finished;
static gboolean
-set_state (State old_state, State new_state)
+set_state (WorkerData *data, State old_state, State new_state)
{
SGEN_ASSERT (0, old_state != new_state, "Why are we transitioning to the same state?");
if (new_state == STATE_NOT_WORKING)
if (new_state == STATE_NOT_WORKING || new_state == STATE_WORKING)
SGEN_ASSERT (6, sgen_thread_pool_is_thread_pool_thread (mono_native_thread_id_get ()), "Only the worker thread is allowed to transition to NOT_WORKING or WORKING");
- return InterlockedCompareExchange (&workers_state, new_state, old_state) == old_state;
+ return InterlockedCompareExchange (&data->state, new_state, old_state) == old_state;
}
static gboolean
+/*
+ * Wake every active worker by forcing its state to WORK ENQUEUED, after
+ * selecting the idle-function context matching the active worker count.
+ */
static void
sgen_workers_ensure_awake (void)
{
- State old_state;
- gboolean did_set_state;
+ int i;
+ gboolean need_signal = FALSE;
- do {
- old_state = workers_state;
+ /*
+ * All workers are awakened, make sure we reset the parallel context.
+ * We call this function only when starting the workers so nobody is running,
+ * or when the last worker is enqueuing preclean work. In both cases we can't
+ * have a worker working using a nopar context, which means it is safe.
+ */
+ idle_func_object_ops = (active_workers_num > 1) ? idle_func_object_ops_par : idle_func_object_ops_nopar;
+ workers_finished = FALSE;
- if (old_state == STATE_WORK_ENQUEUED)
- break;
+ for (i = 0; i < active_workers_num; i++) {
+ State old_state;
+ gboolean did_set_state;
+
+ do {
+ old_state = workers_data [i].state;
+
+ if (old_state == STATE_WORK_ENQUEUED)
+ break;
- did_set_state = set_state (old_state, STATE_WORK_ENQUEUED);
- } while (!did_set_state);
+ did_set_state = set_state (&workers_data [i], old_state, STATE_WORK_ENQUEUED);
+ } while (!did_set_state);
- if (!state_is_working_or_enqueued (old_state))
+ if (!state_is_working_or_enqueued (old_state))
+ need_signal = TRUE;
+ }
+
+ /* Only poke the pool if at least one worker was not already working/enqueued. */
+ if (need_signal)
 sgen_thread_pool_idle_signal ();
}
worker_try_finish (WorkerData *data)
{
State old_state;
+ int i, working = 0;
++stat_workers_num_finished;
+ mono_os_mutex_lock (&finished_lock);
+
+ for (i = 0; i < active_workers_num; i++) {
+ if (state_is_working_or_enqueued (workers_data [i].state))
+ working++;
+ }
+
+ if (working == 1) {
+ SgenWorkersFinishCallback callback = finish_callback;
+ SGEN_ASSERT (0, idle_func_object_ops == idle_func_object_ops_nopar, "Why are we finishing with parallel context");
+ /* We are the last one left. Enqueue preclean job if we have one and awake everybody */
+ SGEN_ASSERT (0, data->state != STATE_NOT_WORKING, "How did we get from doing idle work to NOT WORKING without setting it ourselves?");
+ if (callback) {
+ finish_callback = NULL;
+ callback ();
+ worker_awakenings = 0;
+ /* Make sure each worker has a chance of seeing the enqueued jobs */
+ sgen_workers_ensure_awake ();
+ SGEN_ASSERT (0, data->state == STATE_WORK_ENQUEUED, "Why did we fail to set our own state to ENQUEUED");
+ goto work_available;
+ }
+ }
+
do {
- old_state = workers_state;
+ old_state = data->state;
SGEN_ASSERT (0, old_state != STATE_NOT_WORKING, "How did we get from doing idle work to NOT WORKING without setting it ourselves?");
if (old_state == STATE_WORK_ENQUEUED)
- return;
+ goto work_available;
SGEN_ASSERT (0, old_state == STATE_WORKING, "What other possibility is there?");
+ } while (!set_state (data, old_state, STATE_NOT_WORKING));
- /* We are the last thread to go to sleep. */
- } while (!set_state (old_state, STATE_NOT_WORKING));
+ /*
+ * If we are second to last to finish, we set the scan context to the non-parallel
+ * version so we can speed up the last worker. This helps us maintain same level
+ * of performance as non-parallel mode even if we fail to distribute work properly.
+ */
+ if (working == 2)
+ idle_func_object_ops = idle_func_object_ops_nopar;
+
+ workers_finished = TRUE;
+ mono_os_mutex_unlock (&finished_lock);
binary_protocol_worker_finish (sgen_timestamp (), forced_stop);
sgen_gray_object_queue_trim_free_list (&data->private_gray_queue);
+ return;
+
+work_available:
+ mono_os_mutex_unlock (&finished_lock);
}
void
sgen_thread_pool_job_enqueue (job);
}
-void
-sgen_workers_wait_for_jobs_finished (void)
-{
- sgen_thread_pool_wait_for_all_jobs ();
- /*
- * If the idle task was never triggered or it finished before the last job did and
- * then didn't get triggered again, we might end up in the situation of having
- * something in the gray queue yet the idle task not working. The easiest way to
- * make sure this doesn't stay that way is to just trigger it again after all jobs
- * have finished.
- */
- sgen_workers_ensure_awake ();
-}
-
static gboolean
workers_get_work (WorkerData *data)
{
if (major->is_concurrent) {
GrayQueueSection *section = sgen_section_gray_queue_dequeue (&workers_distribute_gray_queue);
if (section) {
- sgen_gray_object_enqueue_section (&data->private_gray_queue, section);
+ sgen_gray_object_enqueue_section (&data->private_gray_queue, section, major->is_parallel);
return TRUE;
}
}
return FALSE;
}
+/*
+ * Try to refill our (empty) private gray queue by stealing a gray-queue
+ * section from another active worker, scanning round-robin starting after
+ * ourselves. Parallel collector only; returns TRUE if a section was obtained.
+ */
+static gboolean
+workers_steal_work (WorkerData *data)
+{
+ SgenMajorCollector *major = sgen_get_major_collector ();
+ GrayQueueSection *section = NULL;
+ int i, current_worker;
+
+ if (!major->is_parallel)
+ return FALSE;
+
+ /* If we're parallel, steal from other workers' private gray queues */
+ g_assert (sgen_gray_object_queue_is_empty (&data->private_gray_queue));
+
+ current_worker = (int) (data - workers_data);
+
+ for (i = 1; i < active_workers_num && !section; i++) {
+ int steal_worker = (current_worker + i) % active_workers_num;
+ if (state_is_working_or_enqueued (workers_data [steal_worker].state))
+ section = sgen_gray_object_steal_section (&workers_data [steal_worker].private_gray_queue);
+ }
+
+ if (section) {
+ sgen_gray_object_enqueue_section (&data->private_gray_queue, section, TRUE);
+ return TRUE;
+ }
+
+ /* Nobody to steal from */
+ g_assert (sgen_gray_object_queue_is_empty (&data->private_gray_queue));
+ return FALSE;
+}
+
static void
concurrent_enqueue_check (GCObject *obj)
{
return;
init_private_gray_queue (data);
+
+ if (worker_init_cb)
+ worker_init_cb (data);
+}
+
+/*
+ * Continue-idle predicate for the thread pool. With per-worker data we
+ * continue only while that worker's state is WORKING / WORK ENQUEUED; called
+ * with NULL (from the pool's generic paths) it answers for all workers.
+ */
+static gboolean
+continue_idle_func (void *data_untyped)
+{
+ if (data_untyped) {
+ WorkerData *data = (WorkerData *)data_untyped;
+ return state_is_working_or_enqueued (data->state);
+ } else {
+ /* Return TRUE if any of the workers is still working */
+ return !sgen_workers_all_done ();
+ }
}
static gboolean
-continue_idle_func (void)
+should_work_func (void *data_untyped)
{
- return state_is_working_or_enqueued (workers_state);
+ WorkerData *data = (WorkerData*)data_untyped;
+ int current_worker = (int) (data - workers_data);
+
+ /* Only the first active_workers_num pool threads take part in this cycle. */
+ return current_worker < active_workers_num;
}
static void
{
WorkerData *data = (WorkerData *)data_untyped;
- SGEN_ASSERT (0, continue_idle_func (), "Why are we called when we're not supposed to work?");
+ SGEN_ASSERT (0, continue_idle_func (data_untyped), "Why are we called when we're not supposed to work?");
SGEN_ASSERT (0, sgen_concurrent_collection_in_progress (), "The worker should only mark in concurrent collections.");
- if (workers_state == STATE_WORK_ENQUEUED) {
- set_state (STATE_WORK_ENQUEUED, STATE_WORKING);
- SGEN_ASSERT (0, workers_state != STATE_NOT_WORKING, "How did we get from WORK ENQUEUED to NOT WORKING?");
+ if (data->state == STATE_WORK_ENQUEUED) {
+ set_state (data, STATE_WORK_ENQUEUED, STATE_WORKING);
+ SGEN_ASSERT (0, data->state != STATE_NOT_WORKING, "How did we get from WORK ENQUEUED to NOT WORKING?");
}
- if (!forced_stop && (!sgen_gray_object_queue_is_empty (&data->private_gray_queue) || workers_get_work (data))) {
+ if (!forced_stop && (!sgen_gray_object_queue_is_empty (&data->private_gray_queue) || workers_get_work (data) || workers_steal_work (data))) {
ScanCopyContext ctx = CONTEXT_FROM_OBJECT_OPERATIONS (idle_func_object_ops, &data->private_gray_queue);
SGEN_ASSERT (0, !sgen_gray_object_queue_is_empty (&data->private_gray_queue), "How is our gray queue empty if we just got work?");
sgen_drain_gray_stack (ctx);
- } else {
- SgenThreadPoolJob *job = preclean_job;
- if (job) {
- sgen_thread_pool_job_enqueue (job);
- preclean_job = NULL;
- } else {
- worker_try_finish (data);
+
+ if (data->private_gray_queue.num_sections > 16 && workers_finished && worker_awakenings < active_workers_num) {
+ /* We bound the number of worker awakenings just to be sure */
+ worker_awakenings++;
+ sgen_workers_ensure_awake ();
}
+ } else {
+ worker_try_finish (data);
}
}
}
+/*
+ * Initialize the worker subsystem: allocate per-worker data and start the
+ * shared thread pool. `callback`, if non-NULL, is stored in worker_init_cb
+ * and runs in each worker thread at init time. Non-concurrent collectors
+ * only get a bare pool with no idle/marking machinery.
+ */
void
-sgen_workers_init (int num_workers)
+sgen_workers_init (int num_workers, SgenWorkerCallback callback)
{
 int i;
 void **workers_data_ptrs = (void **)alloca(num_workers * sizeof(void *));
 if (!sgen_get_major_collector ()->is_concurrent) {
- sgen_thread_pool_init (num_workers, thread_pool_init_func, NULL, NULL, NULL);
+ sgen_thread_pool_init (num_workers, thread_pool_init_func, NULL, NULL, NULL, NULL);
 return;
 }
+ mono_os_mutex_init (&finished_lock);
 //g_print ("initing %d workers\n", num_workers);
 workers_num = num_workers;
+ active_workers_num = num_workers;
 workers_data = (WorkerData *)sgen_alloc_internal_dynamic (sizeof (WorkerData) * num_workers, INTERNAL_MEM_WORKER_DATA, TRUE);
 memset (workers_data, 0, sizeof (WorkerData) * num_workers);
 init_distribute_gray_queue ();
- for (i = 0; i < workers_num; ++i)
+ for (i = 0; i < num_workers; ++i)
 workers_data_ptrs [i] = (void *) &workers_data [i];
- sgen_thread_pool_init (num_workers, thread_pool_init_func, marker_idle_func, continue_idle_func, workers_data_ptrs);
+ worker_init_cb = callback;
+
+ sgen_thread_pool_init (num_workers, thread_pool_init_func, marker_idle_func, continue_idle_func, should_work_func, workers_data_ptrs);
 mono_counters_register ("# workers finished", MONO_COUNTER_GC | MONO_COUNTER_ULONG, &stat_workers_num_finished);
}
void
sgen_workers_stop_all_workers (void)
{
- preclean_job = NULL;
+ finish_callback = NULL;
mono_memory_write_barrier ();
forced_stop = TRUE;
sgen_thread_pool_wait_for_all_jobs ();
sgen_thread_pool_idle_wait ();
- SGEN_ASSERT (0, workers_state == STATE_NOT_WORKING, "Can only signal enqueue work when in no work state");
+ SGEN_ASSERT (0, sgen_workers_all_done (), "Can only signal enqueue work when in no work state");
+}
+
+/*
+ * Set how many of the initialized workers take part in the next collection.
+ * Passing 0 restores the full worker count. The requested count must not
+ * exceed the number of workers created in sgen_workers_init ().
+ */
+void
+sgen_workers_set_num_active_workers (int num_workers)
+{
+ if (num_workers) {
+ /* Validate the requested count, not the previous active count. */
+ SGEN_ASSERT (0, num_workers <= workers_num, "We can't start more workers than we initialized");
+ active_workers_num = num_workers;
+ } else {
+ active_workers_num = workers_num;
+ }
+}
void
-sgen_workers_start_all_workers (SgenObjectOperations *object_ops, SgenThreadPoolJob *job)
+sgen_workers_start_all_workers (SgenObjectOperations *object_ops_nopar, SgenObjectOperations *object_ops_par, SgenWorkersFinishCallback callback)
{
+ idle_func_object_ops_par = object_ops_par;
+ idle_func_object_ops_nopar = object_ops_nopar;
forced_stop = FALSE;
- idle_func_object_ops = object_ops;
- preclean_job = job;
+ finish_callback = callback;
+ worker_awakenings = 0;
mono_memory_write_barrier ();
sgen_workers_ensure_awake ();
sgen_thread_pool_wait_for_all_jobs ();
sgen_thread_pool_idle_wait ();
- SGEN_ASSERT (0, workers_state == STATE_NOT_WORKING, "Can only signal enqueue work when in no work state");
+ SGEN_ASSERT (0, sgen_workers_all_done (), "Can only signal enqueue work when in no work state");
/* At this point all the workers have stopped. */
SGEN_ASSERT (0, sgen_section_gray_queue_is_empty (&workers_distribute_gray_queue), "Why is there still work left to do?");
- for (i = 0; i < workers_num; ++i)
+ for (i = 0; i < active_workers_num; ++i)
SGEN_ASSERT (0, sgen_gray_object_queue_is_empty (&workers_data [i].private_gray_queue), "Why is there still work left to do?");
}
if (!sgen_section_gray_queue_is_empty (&workers_distribute_gray_queue))
return TRUE;
- for (i = 0; i < workers_num; ++i) {
+ for (i = 0; i < active_workers_num; ++i) {
if (!sgen_gray_object_queue_is_empty (&workers_data [i].private_gray_queue))
return TRUE;
}
gboolean
sgen_workers_all_done (void)
{
- return workers_state == STATE_NOT_WORKING;
+ int i;
+
+ /* TRUE only when no active worker is WORKING or has work enqueued. */
+ for (i = 0; i < active_workers_num; i++) {
+ if (state_is_working_or_enqueued (workers_data [i].state))
+ return FALSE;
+ }
+ return TRUE;
}
/* Must only be used for debugging */
gboolean
sgen_workers_are_working (void)
{
- return state_is_working_or_enqueued (workers_state);
+ /* Debug-only: delegates to the per-worker state scan. */
+ return !sgen_workers_all_done ();
}
void
}
+/*
+ * Move all sections from `queue` into the workers' distribute gray queue,
+ * spreading the contents first so work splits evenly across jobs. The
+ * workers must not be running yet (see the assert); the caller starts them
+ * afterwards.
+ */
void
-sgen_workers_take_from_queue_and_awake (SgenGrayQueue *queue)
+sgen_workers_take_from_queue (SgenGrayQueue *queue)
{
- gboolean wake = FALSE;
+ sgen_gray_object_spread (queue, sgen_workers_get_job_split_count ());
 for (;;) {
 GrayQueueSection *section = sgen_gray_object_dequeue_section (queue);
 if (!section)
 break;
 sgen_section_gray_queue_enqueue (&workers_distribute_gray_queue, section);
- wake = TRUE;
 }
- if (wake) {
- SGEN_ASSERT (0, sgen_concurrent_collection_in_progress (), "Why is there work to take when there's no concurrent collection in progress?");
- sgen_workers_ensure_awake ();
- }
+ SGEN_ASSERT (0, !sgen_workers_are_working (), "We should fully populate the distribute gray queue before we start the workers");
+}
+
+/*
+ * Object operations used by the workers' idle (marking) function: the
+ * parallel ops when they were provided, otherwise the non-parallel ones.
+ */
+SgenObjectOperations*
+sgen_workers_get_idle_func_object_ops (void)
+{
+ return (idle_func_object_ops_par) ? idle_func_object_ops_par : idle_func_object_ops_nopar;
+}
+
+/*
+ * If we have a single worker, splitting into multiple jobs makes no sense. With
+ * more than one worker, we split into a larger number of jobs so that, in case
+ * the work load is uneven, a worker that finished quickly can take up more jobs
+ * than another one. The factor of 4 is a heuristic over-partitioning ratio.
+ */
+int
+sgen_workers_get_job_split_count (void)
+{
+ return (active_workers_num > 1) ? active_workers_num * 4 : 1;
+}
+
+/* Invoke `callback` on every worker's data — all initialized workers, not just the active ones. */
+void
+sgen_workers_foreach (SgenWorkerCallback callback)
+{
+ int i;
+
+ for (i = 0; i < workers_num; i++)
+ callback (&workers_data [i]);
+}
#endif
typedef struct _WorkerData WorkerData;
struct _WorkerData {
+ gint32 state;
SgenGrayQueue private_gray_queue; /* only read/written by worker thread */
+ /*
+ * Workers allocate major objects only from here. It has same structure as the
+ * global one. This is normally accessed from the worker_block_free_list_key.
+ * We hold it here so we can clear free lists from all threads before sweep
+ * starts.
+ */
+ gpointer free_block_lists;
};
-void sgen_workers_init (int num_workers);
+typedef void (*SgenWorkersFinishCallback) (void);
+typedef void (*SgenWorkerCallback) (WorkerData *data);
+
+void sgen_workers_init (int num_workers, SgenWorkerCallback callback);
void sgen_workers_stop_all_workers (void);
-void sgen_workers_start_all_workers (SgenObjectOperations *object_ops, SgenThreadPoolJob *finish_job);
+void sgen_workers_set_num_active_workers (int num_workers);
+void sgen_workers_start_all_workers (SgenObjectOperations *object_ops_nopar, SgenObjectOperations *object_ops_par, SgenWorkersFinishCallback finish_job);
void sgen_workers_init_distribute_gray_queue (void);
void sgen_workers_enqueue_job (SgenThreadPoolJob *job, gboolean enqueue);
-void sgen_workers_wait_for_jobs_finished (void);
void sgen_workers_distribute_gray_queue_sections (void);
void sgen_workers_reset_data (void);
void sgen_workers_join (void);
gboolean sgen_workers_all_done (void);
gboolean sgen_workers_are_working (void);
void sgen_workers_assert_gray_queue_is_empty (void);
-void sgen_workers_take_from_queue_and_awake (SgenGrayQueue *queue);
+void sgen_workers_take_from_queue (SgenGrayQueue *queue);
+SgenObjectOperations* sgen_workers_get_idle_func_object_ops (void);
+int sgen_workers_get_job_split_count (void);
+void sgen_workers_foreach (SgenWorkerCallback callback);
#endif
sgen-regular-tests: $(SGEN_REGULAR_TESTS)
$(MAKE) sgen-regular-tests-ms
$(MAKE) sgen-regular-tests-ms-conc
+ $(MAKE) sgen-regular-tests-ms-conc-par
$(MAKE) sgen-regular-tests-ms-conc-split
$(MAKE) sgen-regular-tests-ms-split
$(MAKE) sgen-regular-tests-ms-conc-split-95
MONO_ENV_OPTIONS="--gc=sgen" MONO_GC_DEBUG="" MONO_GC_PARAMS="major=marksweep" $(RUNTIME) $(TEST_RUNNER) $(TEST_RUNNER_ARGS) --testsuite-name $@ --disabled "$(DISABLED_TESTS)" --timeout 900 $(SGEN_REGULAR_TESTS)
sgen-regular-tests-ms-conc: $(SGEN_REGULAR_TESTS) test-runner.exe
MONO_ENV_OPTIONS="--gc=sgen" MONO_GC_DEBUG="" MONO_GC_PARAMS="major=marksweep-conc" $(RUNTIME) $(TEST_RUNNER) $(TEST_RUNNER_ARGS) --testsuite-name $@ --disabled "$(DISABLED_TESTS)" --timeout 900 $(SGEN_REGULAR_TESTS)
+sgen-regular-tests-ms-conc-par: $(SGEN_REGULAR_TESTS) test-runner.exe
+ MONO_ENV_OPTIONS="--gc=sgen" MONO_GC_DEBUG="" MONO_GC_PARAMS="major=marksweep-conc-par" $(RUNTIME) $(TEST_RUNNER) $(TEST_RUNNER_ARGS) --testsuite-name $@ --disabled "$(DISABLED_TESTS)" --timeout 900 $(SGEN_REGULAR_TESTS)
sgen-regular-tests-ms-conc-split: $(SGEN_REGULAR_TESTS) test-runner.exe
MONO_ENV_OPTIONS="--gc=sgen" MONO_GC_DEBUG="" MONO_GC_PARAMS="major=marksweep-conc,minor=split" $(RUNTIME) $(TEST_RUNNER) $(TEST_RUNNER_ARGS) --testsuite-name $@ --disabled "$(DISABLED_TESTS)" --timeout 900 $(SGEN_REGULAR_TESTS)
sgen-regular-tests-ms-split: $(SGEN_REGULAR_TESTS) test-runner.exe
#include "sgen-entry-stream.h"
#include "sgen-grep-binprot.h"
+static int file_version = 0;
+
#ifdef BINPROT_HAS_HEADER
#define PACKED_SUFFIX p
#else
#define MAX_ENTRY_SIZE (1 << 10)
static int
-read_entry (EntryStream *stream, void *data)
+read_entry (EntryStream *stream, void *data, unsigned char *windex)
{
unsigned char type;
ssize_t size;
if (read_stream (stream, &type, 1) <= 0)
return SGEN_PROTOCOL_EOF;
+
+ if (windex) {
+ if (file_version >= 2) {
+ if (read_stream (stream, windex, 1) <= 0)
+ return SGEN_PROTOCOL_EOF;
+ } else {
+ *windex = !!(WORKER (type));
+ }
+ }
+
switch (TYPE (type)) {
#define BEGIN_PROTOCOL_ENTRY0(method) \
}
}
-#define WORKER_PREFIX(t) (WORKER ((t)) ? "w" : " ")
-
enum { NO_COLOR = -1 };
typedef struct {
}
static void
-print_entry (int type, void *data, int num_nums, int *match_indices, gboolean color_output)
+print_entry (int type, void *data, int num_nums, int *match_indices, gboolean color_output, unsigned char worker_index)
{
const char *always_prefix = is_always_match (type) ? " " : "";
- printf ("%s%s ", WORKER_PREFIX (type), always_prefix);
+ if (worker_index)
+ printf ("w%-2d%s ", worker_index, always_prefix);
+ else
+ printf (" %s ", always_prefix);
switch (TYPE (type)) {
{
#ifdef BINPROT_HAS_HEADER
char data [MAX_ENTRY_SIZE];
- int type = read_entry (stream, data);
+ int type = read_entry (stream, data, NULL);
if (type == SGEN_PROTOCOL_EOF)
return FALSE;
if (type == PROTOCOL_ID (binary_protocol_header)) {
PROTOCOL_STRUCT (binary_protocol_header) * str = (PROTOCOL_STRUCT (binary_protocol_header) *) data;
- if (str->check == PROTOCOL_HEADER_CHECK && str->ptr_size == BINPROT_SIZEOF_VOID_P)
+ if (str->check == PROTOCOL_HEADER_CHECK && str->ptr_size == BINPROT_SIZEOF_VOID_P) {
+ if (str->version > PROTOCOL_HEADER_VERSION) {
+ fprintf (stderr, "The file contains a newer version %d. We support up to %d. Please update.\n", str->version, PROTOCOL_HEADER_VERSION);
+ exit (1);
+ }
+ file_version = str->version;
return TRUE;
+ }
}
return FALSE;
#else
gboolean dump_all, gboolean pause_times, gboolean color_output, unsigned long long first_entry_to_consider)
{
int type;
+ unsigned char worker_index;
void *data = g_malloc0 (MAX_ENTRY_SIZE);
int i;
gboolean pause_times_stopped = FALSE;
return FALSE;
entry_index = 0;
- while ((type = read_entry (stream, data)) != SGEN_PROTOCOL_EOF) {
+ while ((type = read_entry (stream, data, &worker_index)) != SGEN_PROTOCOL_EOF) {
if (entry_index < first_entry_to_consider)
goto next_entry;
if (pause_times) {
if (dump_all)
printf (match ? "* " : " ");
if (match || dump_all)
- print_entry (type, data, num_nums, match_indices, color_output);
+ print_entry (type, data, num_nums, match_indices, color_output, worker_index);
}
next_entry:
++entry_index;