Merge pull request #2869 from BrzVlad/feature-mod-union-opt
[mono.git] / mono / sgen / sgen-marksweep.c
index 929d86031fee406233d691cde0c4933781f1e9ad..4c63191ad447d0ea5c9fb76bf44f9eba3ee19b46 100644 (file)
@@ -7,18 +7,7 @@
  * Copyright 2009-2010 Novell, Inc.
  * Copyright (C) 2012 Xamarin Inc
  *
- * This library is free software; you can redistribute it and/or
- * modify it under the terms of the GNU Library General Public
- * License 2.0 as published by the Free Software Foundation;
- *
- * This library is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
- * Library General Public License for more details.
- *
- * You should have received a copy of the GNU Library General Public
- * License 2.0 along with this library; if not, write to the Free
- * Software Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ * Licensed under the MIT license. See LICENSE file in the project root for full license information.
  */
 
 #include "config.h"
 #include "mono/sgen/sgen-memory-governor.h"
 #include "mono/sgen/sgen-layout-stats.h"
 #include "mono/sgen/sgen-pointer-queue.h"
+#include "mono/sgen/sgen-array-list.h"
 #include "mono/sgen/sgen-pinning.h"
 #include "mono/sgen/sgen-workers.h"
 #include "mono/sgen/sgen-thread-pool.h"
 #include "mono/sgen/sgen-client.h"
-#include "mono/utils/mono-membar.h"
+#include "mono/utils/mono-memory-model.h"
 
 #if defined(ARCH_MIN_MS_BLOCK_SIZE) && defined(ARCH_MIN_MS_BLOCK_SIZE_SHIFT)
 #define MS_BLOCK_SIZE  ARCH_MIN_MS_BLOCK_SIZE
@@ -109,6 +99,7 @@ struct _MSBlockInfo {
        guint16 obj_size_index;
        /* FIXME: Reduce this - it only needs a byte. */
        volatile gint32 state;
+       gint16 nused;
        unsigned int pinned : 1;
        unsigned int has_references : 1;
        unsigned int has_pinned : 1;    /* means cannot evacuate */
@@ -171,7 +162,7 @@ static int fast_block_obj_size_indexes [MS_NUM_FAST_BLOCK_OBJ_SIZE_INDEXES];
 static gboolean *evacuate_block_obj_sizes;
 static float evacuation_threshold = 0.666f;
 
-static gboolean lazy_sweep = FALSE;
+static gboolean lazy_sweep = TRUE;
 
 enum {
        SWEEP_STATE_SWEPT,
@@ -197,27 +188,25 @@ static gboolean concurrent_sweep = TRUE;
 #define BLOCK_TAG(bl)                          ((bl)->has_references ? BLOCK_TAG_HAS_REFERENCES ((bl)) : (bl))
 
 /* all allocated blocks in the system */
-static SgenPointerQueue allocated_blocks;
+static SgenArrayList allocated_blocks = SGEN_ARRAY_LIST_INIT (NULL, NULL, NULL, INTERNAL_MEM_PIN_QUEUE);
 
 /* non-allocated block free-list */
 static void *empty_blocks = NULL;
 static size_t num_empty_blocks = 0;
 
-#define FOREACH_BLOCK_NO_LOCK_CONDITION(cond,bl) {                     \
-       size_t __index;                                                 \
-       SGEN_ASSERT (0, (cond) && !sweep_in_progress (), "Can't iterate blocks while the world is running or sweep is in progress."); \
-       for (__index = 0; __index < allocated_blocks.next_slot; ++__index) { \
-               (bl) = BLOCK_UNTAG (allocated_blocks.data [__index]);
-#define FOREACH_BLOCK_NO_LOCK(bl)                                      \
-       FOREACH_BLOCK_NO_LOCK_CONDITION(sgen_is_world_stopped (), bl)
+#define FOREACH_BLOCK_NO_LOCK(bl) {                                    \
+       volatile gpointer *slot;                                                \
+       SGEN_ASSERT (0, !sweep_in_progress (), "Can't iterate blocks while sweep is in progress."); \
+       SGEN_ARRAY_LIST_FOREACH_SLOT (&allocated_blocks, slot) {        \
+               (bl) = BLOCK_UNTAG (*slot);
 #define FOREACH_BLOCK_HAS_REFERENCES_NO_LOCK(bl,hr) {                  \
-       size_t __index;                                                 \
-       SGEN_ASSERT (0, sgen_is_world_stopped () && !sweep_in_progress (), "Can't iterate blocks while the world is running or sweep is in progress."); \
-       for (__index = 0; __index < allocated_blocks.next_slot; ++__index) { \
-               (bl) = (MSBlockInfo *)allocated_blocks.data [__index];                  \
+       volatile gpointer *slot;                                                \
+       SGEN_ASSERT (0, !sweep_in_progress (), "Can't iterate blocks while sweep is in progress."); \
+       SGEN_ARRAY_LIST_FOREACH_SLOT (&allocated_blocks, slot) {        \
+               (bl) = (MSBlockInfo *) (*slot);                 \
                (hr) = BLOCK_IS_TAGGED_HAS_REFERENCES ((bl));           \
                (bl) = BLOCK_UNTAG ((bl));
-#define END_FOREACH_BLOCK_NO_LOCK      } }
+#define END_FOREACH_BLOCK_NO_LOCK      } SGEN_ARRAY_LIST_END_FOREACH_SLOT; }
 
 static volatile size_t num_major_sections = 0;
 /*
@@ -386,11 +375,14 @@ ms_get_empty_block (void)
  * list, where it will either be freed later on, or reused in nursery collections.
  */
 static void
-ms_free_block (void *block)
+ms_free_block (MSBlockInfo *info)
 {
        void *empty;
+       char *block = MS_BLOCK_FOR_BLOCK_INFO (info);
 
        sgen_memgov_release_space (MS_BLOCK_SIZE, SPACE_MAJOR);
+       if (info->cardtable_mod_union)
+               sgen_card_table_free_mod_union (info->cardtable_mod_union, block, MS_BLOCK_SIZE);
        memset (block, 0, MS_BLOCK_SIZE);
 
        do {
@@ -436,7 +428,7 @@ check_block_free_list (MSBlockInfo *block, int size, gboolean pinned)
                g_assert (block->free_list);
 
                /* the block must be in the allocated_blocks array */
-               g_assert (sgen_pointer_queue_find (&allocated_blocks, BLOCK_TAG (block)) != (size_t)-1);
+               g_assert (sgen_array_list_find (&allocated_blocks, BLOCK_TAG (block)) != (guint32)-1);
        }
 }
 
@@ -567,7 +559,7 @@ ms_alloc_block (int size_index, gboolean pinned, gboolean has_references)
        major_finish_sweep_checking ();
        mono_memory_barrier ();
 
-       sgen_pointer_queue_add (&allocated_blocks, BLOCK_TAG (info));
+       sgen_array_list_add (&allocated_blocks, BLOCK_TAG (info), 0, FALSE);
 
        SGEN_ATOMIC_ADD_P (num_major_sections, 1);
        return TRUE;
@@ -815,14 +807,15 @@ set_sweep_state (int new_, int expected)
        SGEN_ASSERT (0, success, "Could not set sweep state.");
 }
 
-static gboolean ensure_block_is_checked_for_sweeping (int block_index, gboolean wait, gboolean *have_checked);
+static gboolean ensure_block_is_checked_for_sweeping (guint32 block_index, gboolean wait, gboolean *have_checked);
 
 static SgenThreadPoolJob * volatile sweep_job;
+static SgenThreadPoolJob * volatile sweep_blocks_job;
 
 static void
 major_finish_sweep_checking (void)
 {
-       int block_index;
+       guint32 block_index;
        SgenThreadPoolJob *job;
 
  retry:
@@ -1076,7 +1069,6 @@ mark_mod_union_card (GCObject *obj, void **ptr, GCObject *value_obj)
        } else {
                sgen_los_mark_mod_union_card (obj, ptr);
        }
-
        binary_protocol_mod_union_remset (obj, ptr, value_obj, SGEN_LOAD_VTABLE (value_obj));
 }
 
@@ -1185,20 +1177,22 @@ static guint64 stat_drain_loops;
 #define COPY_OR_MARK_WITH_EVACUATION
 #define COPY_OR_MARK_FUNCTION_NAME     major_copy_or_mark_object_with_evacuation
 #define SCAN_OBJECT_FUNCTION_NAME      major_scan_object_with_evacuation
+#define SCAN_VTYPE_FUNCTION_NAME       major_scan_vtype_with_evacuation
 #define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_with_evacuation
+#define SCAN_PTR_FIELD_FUNCTION_NAME   major_scan_ptr_field_with_evacuation
 #include "sgen-marksweep-drain-gray-stack.h"
 
-#undef COPY_OR_MARK_WITH_EVACUATION
 #define COPY_OR_MARK_CONCURRENT
 #define COPY_OR_MARK_FUNCTION_NAME     major_copy_or_mark_object_concurrent_no_evacuation
 #define SCAN_OBJECT_FUNCTION_NAME      major_scan_object_concurrent_no_evacuation
 #define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_concurrent_no_evacuation
 #include "sgen-marksweep-drain-gray-stack.h"
 
-#undef COPY_OR_MARK_CONCURRENT
 #define COPY_OR_MARK_CONCURRENT_WITH_EVACUATION
 #define COPY_OR_MARK_FUNCTION_NAME     major_copy_or_mark_object_concurrent_with_evacuation
 #define SCAN_OBJECT_FUNCTION_NAME      major_scan_object_concurrent_with_evacuation
+#define SCAN_VTYPE_FUNCTION_NAME       major_scan_vtype_concurrent_with_evacuation
+#define SCAN_PTR_FIELD_FUNCTION_NAME   major_scan_ptr_field_concurrent_with_evacuation
 #define DRAIN_GRAY_STACK_FUNCTION_NAME drain_gray_stack_concurrent_with_evacuation
 #include "sgen-marksweep-drain-gray-stack.h"
 
@@ -1233,8 +1227,6 @@ drain_gray_stack_concurrent (SgenGrayQueue *queue)
                return drain_gray_stack_concurrent_no_evacuation (queue);
 }
 
-#include "sgen-marksweep-scan-object-concurrent.h"
-
 static void
 major_copy_or_mark_object_canonical (GCObject **ptr, SgenGrayQueue *queue)
 {
@@ -1401,8 +1393,8 @@ bitcount (mword d)
        int count = 0;
 
 #ifdef __GNUC__
-       if (sizeof (mword) == sizeof (unsigned long))
-               count += __builtin_popcountl (d);
+       if (sizeof (mword) == 8)
+               count += __builtin_popcountll (d);
        else
                count += __builtin_popcount (d);
 #else
@@ -1449,7 +1441,7 @@ static void sweep_finish (void);
  * be correct, i.e. must not be used.
  */
 static gboolean
-ensure_block_is_checked_for_sweeping (int block_index, gboolean wait, gboolean *have_checked)
+ensure_block_is_checked_for_sweeping (guint32 block_index, gboolean wait, gboolean *have_checked)
 {
        int count;
        gboolean have_live = FALSE;
@@ -1459,6 +1451,7 @@ ensure_block_is_checked_for_sweeping (int block_index, gboolean wait, gboolean *
        int i;
        void *tagged_block;
        MSBlockInfo *block;
+       volatile gpointer *block_slot = sgen_array_list_get_slot (&allocated_blocks, block_index);
 
        SGEN_ASSERT (6, sweep_in_progress (), "Why do we call this function if there's no sweep in progress?");
 
@@ -1466,7 +1459,7 @@ ensure_block_is_checked_for_sweeping (int block_index, gboolean wait, gboolean *
                *have_checked = FALSE;
 
  retry:
-       tagged_block = *(void * volatile *)&allocated_blocks.data [block_index];
+       tagged_block = *(void * volatile *)block_slot;
        if (!tagged_block)
                return FALSE;
 
@@ -1478,7 +1471,7 @@ ensure_block_is_checked_for_sweeping (int block_index, gboolean wait, gboolean *
                goto retry;
        }
 
-       if (SGEN_CAS_PTR (&allocated_blocks.data [block_index], BLOCK_TAG_CHECKING (tagged_block), tagged_block) != tagged_block)
+       if (SGEN_CAS_PTR (block_slot, BLOCK_TAG_CHECKING (tagged_block), tagged_block) != tagged_block)
                goto retry;
 
        block = BLOCK_UNTAG (tagged_block);
@@ -1517,15 +1510,14 @@ ensure_block_is_checked_for_sweeping (int block_index, gboolean wait, gboolean *
 
        count = MS_BLOCK_FREE / block->obj_size;
 
-       if (block->cardtable_mod_union) {
-               sgen_card_table_free_mod_union (block->cardtable_mod_union, MS_BLOCK_FOR_BLOCK_INFO (block), MS_BLOCK_SIZE);
-               block->cardtable_mod_union = NULL;
-       }
+       if (block->cardtable_mod_union)
+               memset (block->cardtable_mod_union, 0, CARDS_PER_BLOCK);
 
        /* Count marked objects in the block */
        for (i = 0; i < MS_NUM_MARK_WORDS; ++i)
                nused += bitcount (block->mark_words [i]);
 
+       block->nused = nused;
        if (nused)
                have_live = TRUE;
        if (nused < count)
@@ -1572,7 +1564,7 @@ ensure_block_is_checked_for_sweeping (int block_index, gboolean wait, gboolean *
                 * block list and freed.
                 */
                SGEN_ASSERT (6, block_index < allocated_blocks.next_slot, "How did the number of blocks shrink?");
-               SGEN_ASSERT (6, allocated_blocks.data [block_index] == BLOCK_TAG_CHECKING (tagged_block), "How did the block move?");
+               SGEN_ASSERT (6, *block_slot == BLOCK_TAG_CHECKING (tagged_block), "How did the block move?");
 
                binary_protocol_empty (MS_BLOCK_OBJ (block, 0), (char*)MS_BLOCK_OBJ (block, count) - (char*)MS_BLOCK_OBJ (block, 0));
                ms_free_block (block);
@@ -1583,15 +1575,29 @@ ensure_block_is_checked_for_sweeping (int block_index, gboolean wait, gboolean *
        }
 
  done:
-       allocated_blocks.data [block_index] = tagged_block;
+       *block_slot = tagged_block;
        return !!tagged_block;
 }
 
+static void
+sweep_blocks_job_func (void *thread_data_untyped, SgenThreadPoolJob *job)
+{
+       volatile gpointer *slot;
+
+       SGEN_ARRAY_LIST_FOREACH_SLOT (&allocated_blocks, slot) {
+               sweep_block (BLOCK_UNTAG (*slot));
+       } SGEN_ARRAY_LIST_END_FOREACH_SLOT;
+
+       mono_memory_write_barrier ();
+
+       sweep_blocks_job = NULL;
+}
+
 static void
 sweep_job_func (void *thread_data_untyped, SgenThreadPoolJob *job)
 {
-       int block_index;
-       int num_blocks = num_major_sections_before_sweep;
+       guint32 block_index;
+       guint32 num_blocks = num_major_sections_before_sweep;
 
        SGEN_ASSERT (0, sweep_in_progress (), "Sweep thread called with wrong state");
        SGEN_ASSERT (0, num_blocks <= allocated_blocks.next_slot, "How did we lose blocks?");
@@ -1601,14 +1607,12 @@ sweep_job_func (void *thread_data_untyped, SgenThreadPoolJob *job)
         * cooperate with the sweep thread to finish sweeping, and they will traverse from
         * low to high, to avoid constantly colliding on the same blocks.
         */
-       for (block_index = num_blocks - 1; block_index >= 0; --block_index) {
-               gboolean have_checked;
-
+       for (block_index = num_blocks; block_index-- > 0;) {
                /*
                 * The block might have been freed by another thread doing some checking
                 * work.
                 */
-               if (!ensure_block_is_checked_for_sweeping (block_index, TRUE, &have_checked))
+               if (!ensure_block_is_checked_for_sweeping (block_index, TRUE, NULL))
                        ++num_major_sections_freed_in_sweep;
        }
 
@@ -1623,13 +1627,22 @@ sweep_job_func (void *thread_data_untyped, SgenThreadPoolJob *job)
 
        if (SGEN_MAX_ASSERT_LEVEL >= 6) {
                for (block_index = num_blocks; block_index < allocated_blocks.next_slot; ++block_index) {
-                       MSBlockInfo *block = BLOCK_UNTAG (allocated_blocks.data [block_index]);
+                       MSBlockInfo *block = BLOCK_UNTAG (*sgen_array_list_get_slot (&allocated_blocks, block_index));
                        SGEN_ASSERT (6, block && block->state == BLOCK_STATE_SWEPT, "How did a new block to be swept get added while swept?");
                }
        }
 
-       sgen_pointer_queue_remove_nulls (&allocated_blocks);
-       mono_memory_barrier ();
+       sgen_array_list_remove_nulls (&allocated_blocks);
+
+       /*
+        * Concurrently sweep all the blocks to reduce workload during minor
+        * pauses where we need certain blocks to be swept. At the start of
+        * the next major we need all blocks to be swept anyway.
+        */
+       if (concurrent_sweep && lazy_sweep) {
+               sweep_blocks_job = sgen_thread_pool_job_alloc ("sweep_blocks", sweep_blocks_job_func, sizeof (SgenThreadPoolJob));
+               sgen_thread_pool_job_enqueue (sweep_blocks_job);
+       }
 
        sweep_finish ();
 
@@ -1654,7 +1667,11 @@ sweep_finish (void)
                }
        }
 
+       sgen_memgov_major_post_sweep ();
+
        set_sweep_state (SWEEP_STATE_SWEPT, SWEEP_STATE_COMPACTING);
+       if (concurrent_sweep)
+               binary_protocol_concurrent_sweep_end (sgen_timestamp ());
 }
 
 static void
@@ -1791,6 +1808,67 @@ major_finish_nursery_collection (void)
 #endif
 }
 
+static int
+block_usage_comparer (const void *bl1, const void *bl2)
+{
+       const gint16 nused1 = (*(MSBlockInfo**)bl1)->nused;
+       const gint16 nused2 = (*(MSBlockInfo**)bl2)->nused;
+
+       return nused2 - nused1;
+}
+
+static void
+sgen_evacuation_freelist_blocks (MSBlockInfo * volatile *block_list, int size_index)
+{
+       MSBlockInfo **evacuated_blocks;
+       size_t index = 0, count, num_blocks = 0, num_used = 0;
+       MSBlockInfo *info;
+       MSBlockInfo * volatile *prev;
+
+       for (info = *block_list; info != NULL; info = info->next_free) {
+               num_blocks++;
+               num_used += info->nused;
+       }
+
+       /*
+        * We have a set of blocks in the freelist which will be evacuated. Instead
+        * of evacuating all of the blocks into new ones, we traverse the freelist
+        * sorting it by the number of occupied slots, evacuating the objects from
+        * blocks with fewer used slots into fuller blocks.
+        *
+        * The number of used slots is set at the end of the previous sweep. Since
+        * we sequentially unlink slots from blocks, except for the head of the
+        * freelist, for blocks on the freelist, the number of used slots is the same
+        * as at the end of the previous sweep.
+        */
+       evacuated_blocks = (MSBlockInfo**)sgen_alloc_internal_dynamic (sizeof (MSBlockInfo*) * num_blocks, INTERNAL_MEM_TEMPORARY, TRUE);
+
+       for (info = *block_list; info != NULL; info = info->next_free) {
+               evacuated_blocks [index++] = info;
+       }
+
+       SGEN_ASSERT (0, num_blocks == index, "Why did the freelist change ?");
+
+       qsort (evacuated_blocks, num_blocks, sizeof (gpointer), block_usage_comparer);
+
+       /*
+        * Form a new freelist with the fullest blocks. These blocks will also be
+        * marked as to_space so we don't evacuate from them.
+        */
+       count = MS_BLOCK_FREE / block_obj_sizes [size_index];
+       prev = block_list;
+       for (index = 0; index < (num_used + count - 1) / count; index++) {
+               SGEN_ASSERT (0, index < num_blocks, "Why do we need more blocks for compaction than we already had ?");
+               info = evacuated_blocks [index];
+               info->is_to_space = TRUE;
+               *prev = info;
+               prev = &info->next_free;
+       }
+       *prev = NULL;
+
+       sgen_free_internal_dynamic (evacuated_blocks, sizeof (MSBlockInfo*) * num_blocks, INTERNAL_MEM_TEMPORARY);
+}
+
 static void
 major_start_major_collection (void)
 {
@@ -1809,22 +1887,38 @@ major_start_major_collection (void)
 
                binary_protocol_evacuating_blocks (block_obj_sizes [i]);
 
-               free_block_lists [0][i] = NULL;
-               free_block_lists [MS_BLOCK_FLAG_REFS][i] = NULL;
+               sgen_evacuation_freelist_blocks (&free_block_lists [0][i], i);
+               sgen_evacuation_freelist_blocks (&free_block_lists [MS_BLOCK_FLAG_REFS][i], i);
        }
 
-       if (lazy_sweep)
-               binary_protocol_sweep_begin (GENERATION_OLD, TRUE);
+       if (lazy_sweep && concurrent_sweep) {
+               /*
+                * sweep_blocks_job is created before sweep_finish, which we wait for above
+                * (major_finish_sweep_checking). After the end of sweep, if we don't have
+                * sweep_blocks_job set, it means that it has already been run.
+                */
+               SgenThreadPoolJob *job = sweep_blocks_job;
+               if (job)
+                       sgen_thread_pool_job_wait (job);
+       }
 
+       if (lazy_sweep && !concurrent_sweep)
+               binary_protocol_sweep_begin (GENERATION_OLD, TRUE);
        /* Sweep all unswept blocks and set them to MARKING */
        FOREACH_BLOCK_NO_LOCK (block) {
-               if (lazy_sweep)
+               if (lazy_sweep && !concurrent_sweep)
                        sweep_block (block);
                SGEN_ASSERT (0, block->state == BLOCK_STATE_SWEPT, "All blocks must be swept when we're pinning.");
                set_block_state (block, BLOCK_STATE_MARKING, BLOCK_STATE_SWEPT);
+               /*
+                * Swept blocks that have a null free_list are full. Evacuation is not
+                * effective on these blocks since we expect them to have high usage anyway,
+                * given that the survival rate for majors is relatively high.
+                */
+               if (evacuate_block_obj_sizes [block->obj_size_index] && !block->free_list)
+                       block->is_to_space = TRUE;
        } END_FOREACH_BLOCK_NO_LOCK;
-
-       if (lazy_sweep)
+       if (lazy_sweep && !concurrent_sweep)
                binary_protocol_sweep_end (GENERATION_OLD, TRUE);
 
        set_sweep_state (SWEEP_STATE_NEED_SWEEPING, SWEEP_STATE_SWEPT);
@@ -2052,7 +2146,7 @@ major_get_used_size (void)
         */
        major_finish_sweep_checking ();
 
-       FOREACH_BLOCK_NO_LOCK_CONDITION (TRUE, block) {
+       FOREACH_BLOCK_NO_LOCK (block) {
                int count = MS_BLOCK_FREE / block->obj_size;
                void **iter;
                size += count * block->obj_size;
@@ -2186,13 +2280,14 @@ initial_skip_card (guint8 *card_data)
 #define MS_OBJ_ALLOCED_FAST(o,b)               (*(void**)(o) && (*(char**)(o) < (b) || *(char**)(o) >= (b) + MS_BLOCK_SIZE))
 
 static void
-scan_card_table_for_block (MSBlockInfo *block, gboolean mod_union, ScanCopyContext ctx)
+scan_card_table_for_block (MSBlockInfo *block, CardTableScanType scan_type, ScanCopyContext ctx)
 {
        SgenGrayQueue *queue = ctx.queue;
        ScanObjectFunc scan_func = ctx.ops->scan_object;
 #ifndef SGEN_HAVE_OVERLAPPING_CARDS
        guint8 cards_copy [CARDS_PER_BLOCK];
 #endif
+       guint8 cards_preclean [CARDS_PER_BLOCK];
        gboolean small_objects;
        int block_obj_size;
        char *block_start;
@@ -2200,6 +2295,10 @@ scan_card_table_for_block (MSBlockInfo *block, gboolean mod_union, ScanCopyConte
        guint8 *card_data_end;
        char *scan_front = NULL;
 
+       /* The concurrent mark doesn't enter evacuating blocks */
+       if (scan_type == CARDTABLE_SCAN_MOD_UNION_PRECLEAN && major_block_is_evacuating (block))
+               return;
+
        block_obj_size = block->obj_size;
        small_objects = block_obj_size < CARD_SIZE_IN_BYTES;
 
@@ -2212,7 +2311,7 @@ scan_card_table_for_block (MSBlockInfo *block, gboolean mod_union, ScanCopyConte
         * Cards aliasing happens in powers of two, so as long as major blocks are aligned to their
         * sizes, they won't overflow the cardtable overlap modulus.
         */
-       if (mod_union) {
+       if (scan_type & CARDTABLE_SCAN_MOD_UNION) {
                card_data = card_base = block->cardtable_mod_union;
                /*
                 * This happens when the nursery collection that precedes finishing
@@ -2220,6 +2319,11 @@ scan_card_table_for_block (MSBlockInfo *block, gboolean mod_union, ScanCopyConte
                 */
                if (!card_data)
                        return;
+
+               if (scan_type == CARDTABLE_SCAN_MOD_UNION_PRECLEAN) {
+                       sgen_card_table_preclean_mod_union (card_data, cards_preclean, CARDS_PER_BLOCK);
+                       card_data = card_base = cards_preclean;
+               }
        } else {
 #ifdef SGEN_HAVE_OVERLAPPING_CARDS
                card_data = card_base = sgen_card_table_get_card_scan_address ((mword)block_start);
@@ -2278,7 +2382,7 @@ scan_card_table_for_block (MSBlockInfo *block, gboolean mod_union, ScanCopyConte
                        if (obj < scan_front || !MS_OBJ_ALLOCED_FAST (obj, block_start))
                                goto next_object;
 
-                       if (mod_union) {
+                       if (scan_type & CARDTABLE_SCAN_MOD_UNION) {
                                /* FIXME: do this more efficiently */
                                int w, b;
                                MS_CALC_MARK_BIT (w, b, obj);
@@ -2293,7 +2397,7 @@ scan_card_table_for_block (MSBlockInfo *block, gboolean mod_union, ScanCopyConte
                                scan_func (object, sgen_obj_get_descriptor (object), queue);
                        } else {
                                size_t offset = sgen_card_table_get_card_offset (obj, block_start);
-                               sgen_cardtable_scan_object (object, block_obj_size, card_base + offset, mod_union, ctx);
+                               sgen_cardtable_scan_object (object, block_obj_size, card_base + offset, ctx);
                        }
                next_object:
                        obj += block_obj_size;
@@ -2311,34 +2415,36 @@ scan_card_table_for_block (MSBlockInfo *block, gboolean mod_union, ScanCopyConte
 }
 
 static void
-major_scan_card_table (gboolean mod_union, ScanCopyContext ctx)
+major_scan_card_table (CardTableScanType scan_type, ScanCopyContext ctx)
 {
        MSBlockInfo *block;
        gboolean has_references;
 
        if (!concurrent_mark)
-               g_assert (!mod_union);
+               g_assert (scan_type == CARDTABLE_SCAN_GLOBAL);
 
        major_finish_sweep_checking ();
-       binary_protocol_major_card_table_scan_start (sgen_timestamp (), mod_union);
+       binary_protocol_major_card_table_scan_start (sgen_timestamp (), scan_type & CARDTABLE_SCAN_MOD_UNION);
        FOREACH_BLOCK_HAS_REFERENCES_NO_LOCK (block, has_references) {
 #ifdef PREFETCH_CARDS
                int prefetch_index = __index + 6;
                if (prefetch_index < allocated_blocks.next_slot) {
-                       MSBlockInfo *prefetch_block = BLOCK_UNTAG (allocated_blocks.data [prefetch_index]);
-                       guint8 *prefetch_cards = sgen_card_table_get_card_scan_address ((mword)MS_BLOCK_FOR_BLOCK_INFO (prefetch_block));
+                       MSBlockInfo *prefetch_block = BLOCK_UNTAG (*sgen_array_list_get_slot (&allocated_blocks, prefetch_index));
                        PREFETCH_READ (prefetch_block);
-                       PREFETCH_WRITE (prefetch_cards);
-                       PREFETCH_WRITE (prefetch_cards + 32);
+                       if (scan_type == CARDTABLE_SCAN_GLOBAL) {
+                               guint8 *prefetch_cards = sgen_card_table_get_card_scan_address ((mword)MS_BLOCK_FOR_BLOCK_INFO (prefetch_block));
+                               PREFETCH_WRITE (prefetch_cards);
+                               PREFETCH_WRITE (prefetch_cards + 32);
+                       }
                 }
 #endif
 
                if (!has_references)
                        continue;
 
-               scan_card_table_for_block (block, mod_union, ctx);
+               scan_card_table_for_block (block, scan_type, ctx);
        } END_FOREACH_BLOCK_NO_LOCK;
-       binary_protocol_major_card_table_scan_end (sgen_timestamp (), mod_union);
+       binary_protocol_major_card_table_scan_end (sgen_timestamp (), scan_type & CARDTABLE_SCAN_MOD_UNION);
 }
 
 static void
@@ -2379,10 +2485,21 @@ update_cardtable_mod_union (void)
        MSBlockInfo *block;
 
        FOREACH_BLOCK_NO_LOCK (block) {
-               size_t num_cards;
-               guint8 *mod_union = get_cardtable_mod_union_for_block (block, TRUE);
-               sgen_card_table_update_mod_union (mod_union, MS_BLOCK_FOR_BLOCK_INFO (block), MS_BLOCK_SIZE, &num_cards);
-               SGEN_ASSERT (6, num_cards == CARDS_PER_BLOCK, "Number of cards calculation is wrong");
+               gpointer *card_start = (gpointer*) sgen_card_table_get_card_address ((mword)MS_BLOCK_FOR_BLOCK_INFO (block));
+               gboolean has_dirty_cards = FALSE;
+               int i;
+               for (i = 0; i < CARDS_PER_BLOCK / sizeof(gpointer); i++) {
+                       if (card_start [i]) {
+                               has_dirty_cards = TRUE;
+                               break;
+                       }
+               }
+               if (has_dirty_cards) {
+                       size_t num_cards;
+                       guint8 *mod_union = get_cardtable_mod_union_for_block (block, TRUE);
+                       sgen_card_table_update_mod_union (mod_union, MS_BLOCK_FOR_BLOCK_INFO (block), MS_BLOCK_SIZE, &num_cards);
+                       SGEN_ASSERT (6, num_cards == CARDS_PER_BLOCK, "Number of cards calculation is wrong");
+               }
        } END_FOREACH_BLOCK_NO_LOCK;
 }
 
@@ -2496,11 +2613,14 @@ sgen_marksweep_init_internal (SgenMajorCollector *collector, gboolean is_concurr
        if (is_concurrent) {
                collector->major_ops_concurrent_start.copy_or_mark_object = major_copy_or_mark_object_concurrent_canonical;
                collector->major_ops_concurrent_start.scan_object = major_scan_object_concurrent_with_evacuation;
+               collector->major_ops_concurrent_start.scan_vtype = major_scan_vtype_concurrent_with_evacuation;
+               collector->major_ops_concurrent_start.scan_ptr_field = major_scan_ptr_field_concurrent_with_evacuation;
                collector->major_ops_concurrent_start.drain_gray_stack = drain_gray_stack_concurrent;
 
                collector->major_ops_concurrent_finish.copy_or_mark_object = major_copy_or_mark_object_concurrent_finish_canonical;
                collector->major_ops_concurrent_finish.scan_object = major_scan_object_with_evacuation;
-               collector->major_ops_concurrent_finish.scan_vtype = major_scan_vtype_concurrent_finish;
+               collector->major_ops_concurrent_finish.scan_vtype = major_scan_vtype_with_evacuation;
+               collector->major_ops_concurrent_finish.scan_ptr_field = major_scan_ptr_field_with_evacuation;
                collector->major_ops_concurrent_finish.drain_gray_stack = drain_gray_stack;
        }