From f7cf1b7e07ff3ca9cfe2954a8c788d45c0d5ef3a Mon Sep 17 00:00:00 2001
From: Vlad Brezae <brezaevlad@gmail.com>
Date: Wed, 28 Jun 2017 01:41:22 +0300
Subject: [PATCH] [sgen] Avoid waiting for workers to report their own finish

If a worker is stuck doing work in another context, it might not be very responsive in transitioning its own state from WORK_ENQUEUED to WORKING and finally to FINISHING. When joining worker threads, we check whether we have any threads working in the corresponding context and, if there aren't any, change their state directly from the gc thread.

We can do this because work is distributed from centralized pools (worker_distribute_gray_queue and the job_queue), instead of per worker. Therefore we don't need all the workers to perform actual work in the context.
---
 mono/sgen/sgen-thread-pool.c |  7 ++--
 mono/sgen/sgen-thread-pool.h |  3 +-
 mono/sgen/sgen-workers.c     | 63 +++++++++++++++++++++++++++---------
 3 files changed, 55 insertions(+), 18 deletions(-)

diff --git a/mono/sgen/sgen-thread-pool.c b/mono/sgen/sgen-thread-pool.c
index 6a03bc3d69d..d1f6d99dadb 100644
--- a/mono/sgen/sgen-thread-pool.c
+++ b/mono/sgen/sgen-thread-pool.c
@@ -21,6 +21,7 @@ static mono_cond_t done_cond;
 
 static int threads_num;
 static MonoNativeThreadId threads [SGEN_THREADPOOL_MAX_NUM_THREADS];
+static int threads_context [SGEN_THREADPOOL_MAX_NUM_THREADS];
 
 static volatile gboolean threadpool_shutdown;
 static volatile int threads_finished;
@@ -190,7 +191,9 @@ thread_func (int worker_index)
 		SgenThreadPoolJob *job = NULL;
 		SgenThreadPoolContext *context = NULL;
 
+		threads_context [worker_index] = -1;
 		get_work (worker_index, &current_context, &do_idle, &job);
+		threads_context [worker_index] = current_context;
 
 		if (!threadpool_shutdown) {
 			context = &pool_contexts [current_context];
@@ -358,13 +361,13 @@ sgen_thread_pool_idle_signal (int context_id)
 }
 
 void
-sgen_thread_pool_idle_wait (int context_id)
+sgen_thread_pool_idle_wait (int context_id, SgenThreadPoolContinueIdleWaitFunc continue_wait)
 {
 	SGEN_ASSERT (0, pool_contexts [context_id].idle_job_func, "Why are we waiting for idle without an idle function?");
 
 	mono_os_mutex_lock (&lock);
 
-	while (pool_contexts [context_id].continue_idle_job_func (NULL, context_id))
+	while (continue_wait (context_id, threads_context))
 		mono_os_cond_wait (&done_cond, &lock);
 
 	mono_os_mutex_unlock (&lock);
diff --git a/mono/sgen/sgen-thread-pool.h b/mono/sgen/sgen-thread-pool.h
index b13848a5a80..b62e85fe35b 100644
--- a/mono/sgen/sgen-thread-pool.h
+++ b/mono/sgen/sgen-thread-pool.h
@@ -24,6 +24,7 @@ typedef void (*SgenThreadPoolThreadInitFunc) (void*);
 typedef void (*SgenThreadPoolIdleJobFunc) (void*);
 typedef gboolean (*SgenThreadPoolContinueIdleJobFunc) (void*, int);
 typedef gboolean (*SgenThreadPoolShouldWorkFunc) (void*);
+typedef gboolean (*SgenThreadPoolContinueIdleWaitFunc) (int, int*);
 
 struct _SgenThreadPoolJob {
 	const char *name;
@@ -60,7 +61,7 @@ void sgen_thread_pool_job_enqueue (int context_id, SgenThreadPoolJob *job);
 void sgen_thread_pool_job_wait (int context_id, SgenThreadPoolJob *job);
 
 void sgen_thread_pool_idle_signal (int context_id);
-void sgen_thread_pool_idle_wait (int context_id);
+void sgen_thread_pool_idle_wait (int context_id, SgenThreadPoolContinueIdleWaitFunc continue_wait);
 
 void sgen_thread_pool_wait_for_all_jobs (int context_id);
 
diff --git a/mono/sgen/sgen-workers.c b/mono/sgen/sgen-workers.c
index 7eb3740cc15..f999a5bdb2e 100644
--- a/mono/sgen/sgen-workers.c
+++ b/mono/sgen/sgen-workers.c
@@ -56,8 +56,6 @@ set_state (WorkerData *data, State old_state, State new_state)
 		SGEN_ASSERT (0, old_state == STATE_WORKING, "We can only transition to NOT WORKING from WORKING");
 	else if (new_state == STATE_WORKING)
 		SGEN_ASSERT (0, old_state == STATE_WORK_ENQUEUED, "We can only transition to WORKING from WORK ENQUEUED");
-	if (new_state == STATE_NOT_WORKING || new_state == STATE_WORKING)
-		SGEN_ASSERT (6, sgen_thread_pool_is_thread_pool_thread (mono_native_thread_id_get ()), "Only the worker thread is allowed to transition to NOT_WORKING or WORKING");
 
 	return InterlockedCompareExchange (&data->state, new_state, old_state) == old_state;
 }
@@ -378,6 +376,50 @@ sgen_workers_create_context (int generation, int num_workers)
 	}
 }
 
+/* This is called with thread pool lock so no context switch can happen */
+static gboolean
+continue_idle_wait (int calling_context, int *threads_context)
+{
+	WorkerContext *context;
+	int i;
+
+	if (worker_contexts [GENERATION_OLD].workers_num && calling_context == worker_contexts [GENERATION_OLD].thread_pool_context)
+		context = &worker_contexts [GENERATION_OLD];
+	else if (worker_contexts [GENERATION_NURSERY].workers_num && calling_context == worker_contexts [GENERATION_NURSERY].thread_pool_context)
+		context = &worker_contexts [GENERATION_NURSERY];
+	else
+		g_assert_not_reached ();
+
+	/*
+	 * We assume there are no pending jobs, since this is called only after
+	 * we waited for all the jobs.
+	 */
+	for (i = 0; i < context->active_workers_num; i++) {
+		if (threads_context [i] == calling_context)
+			return TRUE;
+	}
+
+	if (sgen_workers_have_idle_work (context->generation) && !context->forced_stop)
+		return TRUE;
+
+	/*
+	 * At this point there are no jobs to be done, and no objects to be scanned
+	 * in the gray queues. We can simply asynchronously finish all the workers
+	 * from the context that were not finished already (due to being stuck working
+	 * in another context)
+	 */
+
+	for (i = 0; i < context->active_workers_num; i++) {
+		if (context->workers_data [i].state == STATE_WORK_ENQUEUED)
+			set_state (&context->workers_data [i], STATE_WORK_ENQUEUED, STATE_WORKING);
+		if (context->workers_data [i].state == STATE_WORKING)
+			worker_try_finish (&context->workers_data [i]);
+	}
+
+	return FALSE;
+}
+
+
 void
 sgen_workers_stop_all_workers (int generation)
 {
@@ -390,7 +432,7 @@ sgen_workers_stop_all_workers (int generation)
 	context->forced_stop = TRUE;
 
 	sgen_thread_pool_wait_for_all_jobs (context->thread_pool_context);
-	sgen_thread_pool_idle_wait (context->thread_pool_context);
+	sgen_thread_pool_idle_wait (context->thread_pool_context, continue_idle_wait);
 	SGEN_ASSERT (0, !sgen_workers_are_working (context), "Can only signal enqueue work when in no work state");
 
 	context->started = FALSE;
@@ -438,16 +480,9 @@ sgen_workers_join (int generation)
 	int i;
 
 	SGEN_ASSERT (0, !context->finish_callback, "Why are we joining concurrent mark early");
-	/*
-	 * It might be the case that a worker didn't get to run anything
-	 * in this context, because it was stuck working on a long job
-	 * in another context. In this case its state is active (WORK_ENQUEUED)
-	 * and we need to wait for it to finish itself.
-	 * FIXME Avoid having to wait for the worker to report its own finish.
-	 */
 
 	sgen_thread_pool_wait_for_all_jobs (context->thread_pool_context);
-	sgen_thread_pool_idle_wait (context->thread_pool_context);
+	sgen_thread_pool_idle_wait (context->thread_pool_context, continue_idle_wait);
 	SGEN_ASSERT (0, !sgen_workers_are_working (context), "Can only signal enqueue work when in no work state");
 
 	/* At this point all the workers have stopped. */
@@ -460,8 +495,8 @@ sgen_workers_join (int generation)
 }
 
 /*
- * Can only be called if the workers are stopped.
- * If we're stopped, there are also no pending jobs.
+ * Can only be called if the workers are not working in the
+ * context and there are no pending jobs.
  */
 gboolean
 sgen_workers_have_idle_work (int generation)
@@ -469,8 +504,6 @@ sgen_workers_have_idle_work (int generation)
 	WorkerContext *context = &worker_contexts [generation];
 	int i;
 
-	SGEN_ASSERT (0, context->forced_stop && !sgen_workers_are_working (context), "Checking for idle work should only happen if the workers are stopped.");
-
 	if (!sgen_section_gray_queue_is_empty (&context->workers_distribute_gray_queue))
 		return TRUE;
 
-- 
2.25.1