[threadpool] Improve the monitor thread heuristic
authorLudovic Henry <ludovic@xamarin.com>
Tue, 9 Feb 2016 19:17:14 +0000 (19:17 +0000)
committerLudovic Henry <ludovic@xamarin.com>
Tue, 9 Feb 2016 19:55:05 +0000 (19:55 +0000)
Because the ThreadPool heuristic is optimized for short-lived work items, it has more difficulty executing long-lived work items. That case can lead to starvation of the worker threads: they are all doing work while there are outstanding requests, and these requests are not satisfied because we have reached the maximum number of working threads.

To fix that issue, we have the monitor thread, whose sole job is to unstick this kind of starvation case. Unfortunately, it only works when all the worker threads are in the ThreadState.WaitSleepJoin state, which excludes the case of calling an IO operation in a Task. That includes the case of FileStream.BeginRead/Write/..., whose implementation can be simplified as follows: FileStream.BeginRead(...) -> Task.Run(() => FileStream.Read(...)).

The way we implement it in this commit is: every MONITOR_INTERVAL (500ms here), we check whether there are any outstanding requests, and if so, we assume that we are in the starvation case and simply increase the maximum number of working threads. Also, to reduce the number of false positives, we do that only if no work item has completed for at least MONITOR_INTERVAL (when CPU usage is low; longer otherwise, see monitor_sufficient_delay_since_last_dequeue). This is typically the case where all working threads are stuck in long-running work items.

Finally, we increase the monitor interval from 100ms to 500ms so that we guarantee we do not create more than 2 threads per second in the monitor thread. That is the value used by the old thread pool.

mono/metadata/threadpool-ms.c

index 7d9657c2fcc61f229b54dca8a45e1af6388ad667..82f9d75c83a871ba74c9c6f9a599f890e8a4590e 100644 (file)
@@ -46,7 +46,7 @@
 #define CPU_USAGE_LOW 80
 #define CPU_USAGE_HIGH 95
 
-#define MONITOR_INTERVAL 100 // ms
+#define MONITOR_INTERVAL 500 // ms
 #define MONITOR_MINIMAL_LIFETIME 60 * 1000 // ms
 
 #define WORKER_CREATION_MAX_PER_SEC 10
@@ -876,8 +876,8 @@ monitor_thread (void)
        mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] monitor thread, started", mono_native_thread_id_get ());
 
        do {
-               MonoInternalThread *thread;
-               gboolean all_waitsleepjoin = TRUE;
+               ThreadPoolCounter counter;
+               gboolean limit_worker_max_reached;
                gint32 interval_left = MONITOR_INTERVAL;
                gint32 awake = 0; /* number of spurious awakes we tolerate before doing a round of rebalancing */
 
@@ -918,49 +918,38 @@ monitor_thread (void)
                }
                mono_coop_mutex_unlock (&threadpool->domains_lock);
 
+               threadpool->cpu_usage = mono_cpu_usage (threadpool->cpu_usage_state);
 
-               mono_coop_mutex_lock (&threadpool->active_threads_lock);
-               for (i = 0; i < threadpool->working_threads->len; ++i) {
-                       thread = (MonoInternalThread *)g_ptr_array_index (threadpool->working_threads, i);
-                       if ((thread->state & ThreadState_WaitSleepJoin) == 0) {
-                               all_waitsleepjoin = FALSE;
-                               break;
-                       }
-               }
-               mono_coop_mutex_unlock (&threadpool->active_threads_lock);
+               if (!monitor_sufficient_delay_since_last_dequeue ())
+                       continue;
 
-               if (all_waitsleepjoin) {
-                       ThreadPoolCounter counter;
-                       gboolean limit_worker_max_reached = FALSE;
+               limit_worker_max_reached = FALSE;
 
-                       COUNTER_ATOMIC (counter, {
-                               if (counter._.max_working >= threadpool->limit_worker_max) {
-                                       limit_worker_max_reached = TRUE;
-                                       break;
-                               }
-                               counter._.max_working ++;
-                       });
+               COUNTER_ATOMIC (counter, {
+                       if (counter._.max_working >= threadpool->limit_worker_max) {
+                               limit_worker_max_reached = TRUE;
+                               break;
+                       }
+                       counter._.max_working ++;
+               });
 
-                       if (!limit_worker_max_reached)
-                               hill_climbing_force_change (counter._.max_working, TRANSITION_STARVATION);
-               }
+               if (limit_worker_max_reached)
+                       continue;
 
-               threadpool->cpu_usage = mono_cpu_usage (threadpool->cpu_usage_state);
+               hill_climbing_force_change (counter._.max_working, TRANSITION_STARVATION);
 
-               if (monitor_sufficient_delay_since_last_dequeue ()) {
-                       for (i = 0; i < 5; ++i) {
-                               if (mono_runtime_is_shutting_down ())
-                                       break;
+               for (i = 0; i < 5; ++i) {
+                       if (mono_runtime_is_shutting_down ())
+                               break;
 
-                               if (worker_try_unpark ()) {
-                                       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] monitor thread, unparked", mono_native_thread_id_get ());
-                                       break;
-                               }
+                       if (worker_try_unpark ()) {
+                               mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] monitor thread, unparked", mono_native_thread_id_get ());
+                               break;
+                       }
 
-                               if (worker_try_create ()) {
-                                       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] monitor thread, created", mono_native_thread_id_get ());
-                                       break;
-                               }
+                       if (worker_try_create ()) {
+                               mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] monitor thread, created", mono_native_thread_id_get ());
+                               break;
                        }
                }
        } while (monitor_should_keep_running ());