Fixes process names trimmed to 15 chars.
[mono.git] / mono / metadata / threadpool-ms.c
index 3bc2507a28ec6c2ba6032afb261692fa8de0d0bb..c1eb63b2d15529300d08faa91d79f246ff556d56 100644 (file)
 // Ported from C++ to C and adjusted to Mono runtime
 
 #include <stdlib.h>
+#define _USE_MATH_DEFINES // needed by MSVC to define math constants
 #include <math.h>
 #include <config.h>
 #include <glib.h>
 
-#if !defined (HAVE_COMPLEX_H)
-#include <../../support/libm/complex.h>
-#else
-#include <complex.h>
-#endif
-
 #include <mono/metadata/class-internals.h>
 #include <mono/metadata/exception.h>
 #include <mono/metadata/gc-internal.h>
 #include <mono/metadata/object-internals.h>
 #include <mono/metadata/threadpool-ms.h>
 #include <mono/metadata/threadpool-ms-io.h>
-#include <mono/metadata/threadpool-internals.h>
 #include <mono/utils/atomic.h>
 #include <mono/utils/mono-compiler.h>
+#include <mono/utils/mono-complex.h>
+#include <mono/utils/mono-lazy-init.h>
+#include <mono/utils/mono-logger.h>
+#include <mono/utils/mono-logger-internal.h>
 #include <mono/utils/mono-proclib.h>
 #include <mono/utils/mono-threads.h>
 #include <mono/utils/mono-time.h>
@@ -49,6 +47,7 @@
 #define CPU_USAGE_HIGH 95
 
 #define MONITOR_INTERVAL 100 // ms
+#define MONITOR_MINIMAL_LIFETIME (60 * 1000) // ms; parenthesized so the macro expands as a single value inside larger expressions
 
 /* The exponent to apply to the gain. 1.0 means to use linear gain,
  * higher values will enhance large moves and damp small ones.
@@ -75,9 +74,9 @@
 typedef union {
        struct {
                gint16 max_working; /* determined by heuristic */
-               gint16 active; /* working or waiting on thread_work_sem; warm threads */
-               gint16 working;
-               gint16 parked;
+               gint16 active; /* running worker_thread, whether currently working or parked */
+               gint16 working; /* actively executing worker_thread, not parked */
+               gint16 parked; /* inside worker_park, waiting to be signalled */
        } _;
        gint64 as_gint64;
 } ThreadPoolCounter;
@@ -87,6 +86,9 @@ typedef struct {
        gint32 outstanding_request;
 } ThreadPoolDomain;
 
+typedef MonoInternalThread ThreadPoolWorkingThread; /* element type of threadpool->working_threads */
+typedef mono_cond_t ThreadPoolParkedThread; /* element type of threadpool->parked_threads: the condition a parked worker waits on */
+
 typedef struct {
        gint32 wave_period;
        gint32 samples_to_measure;
@@ -126,11 +128,9 @@ typedef struct {
        GPtrArray *domains; // ThreadPoolDomain* []
        mono_mutex_t domains_lock;
 
-       GPtrArray *working_threads; // MonoInternalThread* []
-       mono_mutex_t working_threads_lock;
-
-       GPtrArray *parked_threads; // mono_cond_t* []
-       mono_mutex_t parked_threads_lock;
+       GPtrArray *working_threads; // ThreadPoolWorkingThread* []
+       GPtrArray *parked_threads; // ThreadPoolParkedThread* []
+       mono_mutex_t active_threads_lock; /* protect access to working_threads and parked_threads */
 
        gint32 heuristic_completions;
        guint32 heuristic_sample_start;
@@ -164,13 +164,14 @@ typedef enum {
        TRANSITION_UNDEFINED,
 } ThreadPoolHeuristicStateTransition;
 
+static mono_lazy_init_t status = MONO_LAZY_INIT_STATUS_NOT_INITIALIZED;
+
 enum {
        MONITOR_STATUS_REQUESTED,
        MONITOR_STATUS_WAITING_FOR_REQUEST,
        MONITOR_STATUS_NOT_RUNNING,
 };
 
-static gint32 status = STATUS_NOT_INITIALIZED;
 static gint32 monitor_status = MONITOR_STATUS_NOT_RUNNING;
 
 static ThreadPool* threadpool;
@@ -178,17 +179,19 @@ static ThreadPool* threadpool;
 #define COUNTER_CHECK(counter) \
        do { \
                g_assert (counter._.max_working > 0); \
+               g_assert (counter._.working >= 0); /* catches an unbalanced working-- */ \
                g_assert (counter._.active >= 0); \
        } while (0)
 
-#define COUNTER_READ() ((ThreadPoolCounter) InterlockedRead64 (&threadpool->counters.as_gint64))
+#define COUNTER_READ() (InterlockedRead64 (&threadpool->counters.as_gint64)) /* raw 64-bit snapshot of the counters; callers store it into a ThreadPoolCounter's as_gint64 */
 
 #define COUNTER_ATOMIC(var,block) \
        do { \
                ThreadPoolCounter __old; \
                do { \
                        g_assert (threadpool); \
-                       (var) = __old = COUNTER_READ (); \
+                       __old.as_gint64 = COUNTER_READ (); \
+                       (var) = __old; \
                        { block; } \
                        COUNTER_CHECK (var); \
                } while (InterlockedCompareExchange64 (&threadpool->counters.as_gint64, (var).as_gint64, __old.as_gint64) != __old.as_gint64); \
@@ -199,7 +202,8 @@ static ThreadPool* threadpool;
                ThreadPoolCounter __old; \
                do { \
                        g_assert (threadpool); \
-                       (var) = __old = COUNTER_READ (); \
+                       __old.as_gint64 = COUNTER_READ (); \
+                       (var) = __old; \
                        (res) = FALSE; \
                        { block; } \
                        COUNTER_CHECK (var); \
@@ -232,27 +236,13 @@ rand_free (gpointer handle)
 }
 
 static void
-ensure_initialized (MonoBoolean *enable_worker_tracking)
+initialize (void)
 {
        ThreadPoolHillClimbing *hc;
        const char *threads_per_cpu_env;
        gint threads_per_cpu;
        gint threads_count;
 
-       if (enable_worker_tracking) {
-               // TODO implement some kind of switch to have the possibily to use it
-               *enable_worker_tracking = FALSE;
-       }
-
-       if (status >= STATUS_INITIALIZED)
-               return;
-       if (status == STATUS_INITIALIZING || InterlockedCompareExchange (&status, STATUS_INITIALIZING, STATUS_NOT_INITIALIZED) != STATUS_NOT_INITIALIZED) {
-               while (status == STATUS_INITIALIZING)
-                       mono_thread_info_yield ();
-               g_assert (status >= STATUS_INITIALIZED);
-               return;
-       }
-
        g_assert (!threadpool);
        threadpool = g_new0 (ThreadPool, 1);
        g_assert (threadpool);
@@ -261,10 +251,8 @@ ensure_initialized (MonoBoolean *enable_worker_tracking)
        mono_mutex_init_recursive (&threadpool->domains_lock);
 
        threadpool->parked_threads = g_ptr_array_new ();
-       mono_mutex_init (&threadpool->parked_threads_lock);
-
        threadpool->working_threads = g_ptr_array_new ();
-       mono_mutex_init (&threadpool->working_threads_lock);
+       mono_mutex_init (&threadpool->active_threads_lock);
 
        threadpool->heuristic_adjustment_interval = 10;
        mono_mutex_init (&threadpool->heuristic_lock);
@@ -313,82 +301,34 @@ ensure_initialized (MonoBoolean *enable_worker_tracking)
        threadpool->cpu_usage_state = g_new0 (MonoCpuUsageState, 1);
 
        threadpool->suspended = FALSE;
-
-       status = STATUS_INITIALIZED;
 }
 
+static void worker_unpark (ThreadPoolParkedThread *thread);
+static void worker_kill (ThreadPoolWorkingThread *thread);
+
 static void
-ensure_cleanedup (void)
+cleanup (void) /* shutdown path: waits for the monitor to stop, then stops working threads and wakes parked ones; this version frees none of the pool's memory */
 {
-       if (status == STATUS_NOT_INITIALIZED && InterlockedCompareExchange (&status, STATUS_CLEANED_UP, STATUS_NOT_INITIALIZED) == STATUS_NOT_INITIALIZED)
-               return;
-       if (status == STATUS_INITIALIZING) {
-               while (status == STATUS_INITIALIZING)
-                       mono_thread_info_yield ();
-       }
-       if (status == STATUS_CLEANED_UP)
-               return;
-       if (status == STATUS_CLEANING_UP || InterlockedCompareExchange (&status, STATUS_CLEANING_UP, STATUS_INITIALIZED) != STATUS_INITIALIZED) {
-               while (status == STATUS_CLEANING_UP)
-                       mono_thread_info_yield ();
-               g_assert (status == STATUS_CLEANED_UP);
-               return;
-       }
+       guint i;
 
        /* we make the assumption along the code that we are
         * cleaning up only if the runtime is shutting down */
        g_assert (mono_runtime_is_shutting_down ());
 
-       /* Unpark all worker threads */
-       mono_mutex_lock (&threadpool->parked_threads_lock);
-       for (;;) {
-               guint i;
-               ThreadPoolCounter counter = COUNTER_READ ();
-               if (counter._.active == 0 && counter._.parked == 0)
-                       break;
-               if (counter._.active == 1) {
-                       MonoInternalThread *thread = mono_thread_internal_current ();
-                       if (thread->threadpool_thread) {
-                               /* if there is only one active thread
-                                * left and it's the current one */
-                               break;
-                       }
-               }
-               for (i = 0; i < threadpool->parked_threads->len; ++i) {
-                       mono_cond_t *cond = (mono_cond_t*) g_ptr_array_index (threadpool->parked_threads, i);
-                       mono_cond_signal (cond);
-               }
-               mono_mutex_unlock (&threadpool->parked_threads_lock);
-               usleep (1000);
-               mono_mutex_lock (&threadpool->parked_threads_lock);
-       }
-       mono_mutex_unlock (&threadpool->parked_threads_lock);
-
        while (monitor_status != MONITOR_STATUS_NOT_RUNNING)
-               usleep (1000);
-
-       g_ptr_array_free (threadpool->domains, TRUE);
-       mono_mutex_destroy (&threadpool->domains_lock);
+               g_usleep (1000); /* poll until the monitor thread has moved monitor_status to MONITOR_STATUS_NOT_RUNNING */
 
-       g_ptr_array_free (threadpool->parked_threads, TRUE);
-       mono_mutex_destroy (&threadpool->parked_threads_lock);
+       mono_mutex_lock (&threadpool->active_threads_lock); /* single lock guarding both arrays walked below */
 
-       g_ptr_array_free (threadpool->working_threads, TRUE);
-       mono_mutex_destroy (&threadpool->working_threads_lock);
+       /* stop all threadpool->working_threads */
+       for (i = 0; i < threadpool->working_threads->len; ++i)
+               worker_kill ((ThreadPoolWorkingThread*) g_ptr_array_index (threadpool->working_threads, i)); /* worker_kill ignores the calling thread */
 
-       mono_mutex_destroy (&threadpool->heuristic_lock);
-       g_free (threadpool->heuristic_hill_climbing.samples);
-       g_free (threadpool->heuristic_hill_climbing.thread_counts);
-       rand_free (threadpool->heuristic_hill_climbing.random_interval_generator);
+       /* unpark all threadpool->parked_threads */
+       for (i = 0; i < threadpool->parked_threads->len; ++i)
+               worker_unpark ((ThreadPoolParkedThread*) g_ptr_array_index (threadpool->parked_threads, i)); /* signals each parked worker's condition variable */
 
-       g_free (threadpool->cpu_usage_state);
-
-       g_assert (threadpool);
-       g_free (threadpool);
-       threadpool = NULL;
-       g_assert (!threadpool);
-
-       status = STATUS_CLEANED_UP;
+       mono_mutex_unlock (&threadpool->active_threads_lock);
 }
 
 void
@@ -461,7 +401,7 @@ domain_remove (ThreadPoolDomain *tpdomain)
 }
 
 static ThreadPoolDomain *
-domain_get_or_create (MonoDomain *domain)
+domain_get (MonoDomain *domain, gboolean create)
 {
        ThreadPoolDomain *tpdomain = NULL;
        guint i;
@@ -476,7 +416,7 @@ domain_get_or_create (MonoDomain *domain)
                        break;
                }
        }
-       if (!tpdomain) {
+       if (!tpdomain && create) {
                tpdomain = g_new0 (ThreadPoolDomain, 1);
                tpdomain->domain = domain;
                domain_add (tpdomain);
@@ -485,6 +425,12 @@ domain_get_or_create (MonoDomain *domain)
        return tpdomain;
 }
 
+static void
+domain_free (ThreadPoolDomain *tpdomain) /* releases a ThreadPoolDomain allocated with g_new0 in domain_get */
+{
+       g_free (tpdomain); /* only frees the memory; does not remove it from threadpool->domains */
+}
+
 static gboolean
 domain_any_has_request (void)
 {
@@ -526,8 +472,6 @@ domain_get_next (ThreadPoolDomain *current)
                        ThreadPoolDomain *tmp = g_ptr_array_index (threadpool->domains, i % len);
                        if (tmp->outstanding_request > 0) {
                                tpdomain = tmp;
-                               tpdomain->outstanding_request --;
-                               g_assert (tpdomain->outstanding_request >= 0);
                                break;
                        }
                }
@@ -540,15 +484,33 @@ static void
 worker_park (void)
 {
        mono_cond_t cond;
+       MonoInternalThread *thread = mono_thread_internal_current (); /* needed to move this thread between working_threads and parked_threads */
+
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] current worker parking", GetCurrentThreadId ());
+
        mono_cond_init (&cond, NULL);
 
-       mono_mutex_lock (&threadpool->parked_threads_lock);
-       g_ptr_array_add (threadpool->parked_threads, &cond);
-       mono_cond_wait (&cond, &threadpool->parked_threads_lock);
-       g_ptr_array_remove (threadpool->parked_threads, &cond);
-       mono_mutex_unlock (&threadpool->parked_threads_lock);
+       mono_gc_set_skip_thread (TRUE); /* presumably lets the GC ignore this thread while it is parked — TODO confirm */
+
+       mono_mutex_lock (&threadpool->active_threads_lock);
+
+       if (!mono_runtime_is_shutting_down ()) { /* checked under the lock: during shutdown the thread returns without parking */
+               g_ptr_array_add (threadpool->parked_threads, &cond); /* publish the condition so worker_try_unpark / cleanup can signal it */
+               g_ptr_array_remove_fast (threadpool->working_threads, thread);
+
+               mono_cond_wait (&cond, &threadpool->active_threads_lock); /* NOTE(review): no predicate loop, so any wakeup (including a spurious one) counts as an unpark — confirm this is acceptable */
+
+               g_ptr_array_add (threadpool->working_threads, thread); /* back to the working set before the lock is released */
+               g_ptr_array_remove (threadpool->parked_threads, &cond); /* &cond is a stack address and must not stay in the array after return */
+       }
+
+       mono_mutex_unlock (&threadpool->active_threads_lock);
+
+       mono_gc_set_skip_thread (FALSE);
 
        mono_cond_destroy (&cond);
+
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] current worker unparking", GetCurrentThreadId ());
 }
 
 static gboolean
@@ -557,90 +519,121 @@ worker_try_unpark (void)
        gboolean res = FALSE;
        guint len;
 
-       mono_mutex_lock (&threadpool->parked_threads_lock);
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] try unpark worker", GetCurrentThreadId ());
+
+       mono_mutex_lock (&threadpool->active_threads_lock);
        len = threadpool->parked_threads->len;
        if (len > 0) {
                mono_cond_t *cond = (mono_cond_t*) g_ptr_array_index (threadpool->parked_threads, len - 1);
                mono_cond_signal (cond);
                res = TRUE;
        }
-       mono_mutex_unlock (&threadpool->parked_threads_lock);
+       mono_mutex_unlock (&threadpool->active_threads_lock);
+
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] try unpark worker, success? %s", GetCurrentThreadId (), res ? "yes" : "no");
+
        return res;
 }
 
+static void
+worker_unpark (ThreadPoolParkedThread *thread) /* wakes one parked worker; cleanup calls this while holding active_threads_lock */
+{
+       mono_cond_signal ((mono_cond_t*) thread); /* ThreadPoolParkedThread is a typedef of mono_cond_t, so the cast only spells out the type */
+}
+
+static void
+worker_kill (ThreadPoolWorkingThread *thread) /* asks a working pool thread to stop; used by cleanup at runtime shutdown */
+{
+       if (thread == mono_thread_internal_current ()) /* never stop the calling thread itself */
+               return;
+
+       mono_thread_internal_stop ((MonoInternalThread*) thread); /* ThreadPoolWorkingThread is a typedef of MonoInternalThread */
+}
+
 static void
 worker_thread (gpointer data)
 {
-       static MonoClass *threadpool_wait_callback_class = NULL;
-       static MonoMethod *perform_wait_callback_method = NULL;
        MonoInternalThread *thread;
-       ThreadPoolDomain *tpdomain;
+       ThreadPoolDomain *tpdomain, *previous_tpdomain;
        ThreadPoolCounter counter;
        gboolean retire = FALSE;
 
-       g_assert (status >= STATUS_INITIALIZED);
-
-       tpdomain = data;
-       g_assert (tpdomain);
-       g_assert (tpdomain->domain);
-
-       if (mono_runtime_is_shutting_down () || mono_domain_is_unloading (tpdomain->domain)) {
-               COUNTER_ATOMIC (counter, { counter._.active --; });
-               return;
-       }
-
-       if (!threadpool_wait_callback_class)
-               threadpool_wait_callback_class = mono_class_from_name (mono_defaults.corlib, "System.Threading.Microsoft", "_ThreadPoolWaitCallback");
-       g_assert (threadpool_wait_callback_class);
-
-       if (!perform_wait_callback_method)
-               perform_wait_callback_method = mono_class_get_method_from_name (threadpool_wait_callback_class, "PerformWaitCallback", 0);
-       g_assert (perform_wait_callback_method);
+       mono_trace (G_LOG_LEVEL_INFO, MONO_TRACE_THREADPOOL, "[%p] worker starting", GetCurrentThreadId ());
 
        g_assert (threadpool);
 
        thread = mono_thread_internal_current ();
        g_assert (thread);
 
+       mono_thread_set_name_internal (thread, mono_string_new (mono_domain_get (), "Threadpool worker"), FALSE);
+
+       mono_mutex_lock (&threadpool->active_threads_lock);
+       g_ptr_array_add (threadpool->working_threads, thread);
+       mono_mutex_unlock (&threadpool->active_threads_lock);
+
+       previous_tpdomain = NULL;
+
        mono_mutex_lock (&threadpool->domains_lock);
 
-       do {
-               guint i, c;
+       while (!mono_runtime_is_shutting_down ()) {
+               tpdomain = NULL;
 
-               g_assert (tpdomain);
-               g_assert (tpdomain->domain);
+               if ((thread->state & (ThreadState_StopRequested | ThreadState_SuspendRequested)) != 0) {
+                       mono_mutex_unlock (&threadpool->domains_lock);
+                       mono_thread_interruption_checkpoint ();
+                       mono_mutex_lock (&threadpool->domains_lock);
+               }
 
-               tpdomain->domain->threadpool_jobs ++;
+               if (retire || !(tpdomain = domain_get_next (previous_tpdomain))) {
+                       COUNTER_ATOMIC (counter, {
+                               counter._.working --;
+                               counter._.parked ++;
+                       });
 
-               mono_mutex_unlock (&threadpool->domains_lock);
+                       mono_mutex_unlock (&threadpool->domains_lock);
+                       worker_park ();
+                       mono_mutex_lock (&threadpool->domains_lock);
 
-               mono_mutex_lock (&threadpool->working_threads_lock);
-               g_ptr_array_add (threadpool->working_threads, thread);
-               mono_mutex_unlock (&threadpool->working_threads_lock);
+                       COUNTER_ATOMIC (counter, {
+                               counter._.working ++;
+                               counter._.parked --;
+                       });
+
+                       if (retire)
+                               retire = FALSE;
+
+                       continue;
+               }
 
-               COUNTER_ATOMIC (counter, { counter._.working ++; });
+               tpdomain->outstanding_request --;
+               g_assert (tpdomain->outstanding_request >= 0);
+
+               mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] worker running in domain %p",
+                       GetCurrentThreadId (), tpdomain->domain, tpdomain->outstanding_request);
+
+               g_assert (tpdomain->domain);
+               g_assert (tpdomain->domain->threadpool_jobs >= 0);
+               tpdomain->domain->threadpool_jobs ++;
+
+               mono_mutex_unlock (&threadpool->domains_lock);
 
                mono_thread_push_appdomain_ref (tpdomain->domain);
                if (mono_domain_set (tpdomain->domain, FALSE)) {
                        MonoObject *exc = NULL;
-                       MonoObject *res = mono_runtime_invoke (perform_wait_callback_method, NULL, NULL, &exc);
+                       MonoObject *res = mono_runtime_invoke (mono_defaults.threadpool_perform_wait_callback_method, NULL, NULL, &exc);
                        if (exc)
-                               mono_internal_thread_unhandled_exception (exc);
+                               mono_thread_internal_unhandled_exception (exc);
                        else if (res && *(MonoBoolean*) mono_object_unbox (res) == FALSE)
                                retire = TRUE;
 
                        mono_thread_clr_state (thread , ~ThreadState_Background);
                        if (!mono_thread_test_state (thread , ThreadState_Background))
                                ves_icall_System_Threading_Thread_SetState (thread, ThreadState_Background);
+
+                       mono_domain_set (mono_get_root_domain (), TRUE);
                }
                mono_thread_pop_appdomain_ref ();
 
-               COUNTER_ATOMIC (counter, { counter._.working --; });
-
-               mono_mutex_lock (&threadpool->working_threads_lock);
-               g_ptr_array_remove_fast (threadpool->working_threads, thread);
-               mono_mutex_unlock (&threadpool->working_threads_lock);
-
                mono_mutex_lock (&threadpool->domains_lock);
 
                tpdomain->domain->threadpool_jobs --;
@@ -651,62 +644,56 @@ worker_thread (gpointer data)
                        g_assert (removed);
                        if (tpdomain->domain->cleanup_semaphore)
                                ReleaseSemaphore (tpdomain->domain->cleanup_semaphore, 1, NULL);
-                       g_free (tpdomain);
+                       domain_free (tpdomain);
                        tpdomain = NULL;
                }
 
-               for (i = 0, c = 5; i < c; ++i) {
-                       if (mono_runtime_is_shutting_down ())
-                               break;
-
-                       if (!retire) {
-                               tpdomain = domain_get_next (tpdomain);
-                               if (tpdomain)
-                                       break;
-                       }
+               previous_tpdomain = tpdomain;
+       }
 
-                       if (i < c - 1) {
-                               gboolean park = TRUE;
-
-                               COUNTER_ATOMIC (counter, {
-                                       if (counter._.active <= counter._.max_working) {
-                                               park = FALSE;
-                                               break;
-                                       }
-                                       counter._.active --;
-                                       counter._.parked ++;
-                               });
-
-                               if (park) {
-                                       mono_mutex_unlock (&threadpool->domains_lock);
-                                       mono_gc_set_skip_thread (TRUE);
-                                       worker_park ();
-                                       mono_gc_set_skip_thread (FALSE);
-                                       mono_mutex_lock (&threadpool->domains_lock);
-
-                                       COUNTER_ATOMIC (counter, {
-                                               counter._.active ++;
-                                               counter._.parked --;
-                                       });
-                               }
-                       }
+       mono_mutex_unlock (&threadpool->domains_lock);
 
-                       retire = FALSE;
-               }
-       } while (tpdomain && !mono_runtime_is_shutting_down ());
+       mono_mutex_lock (&threadpool->active_threads_lock);
+       g_ptr_array_remove_fast (threadpool->working_threads, thread);
+       mono_mutex_unlock (&threadpool->active_threads_lock);
 
-       mono_mutex_unlock (&threadpool->domains_lock);
+       COUNTER_ATOMIC (counter, {
+               counter._.working--;
+               counter._.active --;
+       });
 
-       COUNTER_ATOMIC (counter, { counter._.active --; });
+       mono_trace (G_LOG_LEVEL_INFO, MONO_TRACE_THREADPOOL, "[%p] worker finishing", GetCurrentThreadId ());
 }
 
 static gboolean
-worker_try_create (ThreadPoolDomain *tpdomain)
+worker_try_create (void) /* reserves a slot in the counters, then starts a worker_thread; returns TRUE only when the thread was created */
 {
-       g_assert (tpdomain);
-       g_assert (tpdomain->domain);
+       ThreadPoolCounter counter;
+       MonoInternalThread *thread;
+
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] try create worker", GetCurrentThreadId ());
+
+       COUNTER_ATOMIC (counter, {
+               if (counter._.working >= counter._.max_working) /* already at the limit: leave before the compare-exchange, counters untouched */
+                       return FALSE;
+               counter._.working ++;
+               counter._.active ++;
+       });
 
-       return mono_thread_create_internal (tpdomain->domain, worker_thread, tpdomain, TRUE, 0) != NULL;
+       if ((thread = mono_thread_create_internal (mono_get_root_domain (), worker_thread, NULL, TRUE, 0)) != NULL) { /* started in the root domain with no argument; worker_thread picks its domain via domain_get_next */
+               mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] try create worker, created %p",
+                       GetCurrentThreadId (), thread->tid);
+               return TRUE;
+       }
+
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] try create worker, failed", GetCurrentThreadId ());
+
+       COUNTER_ATOMIC (counter, { /* thread creation failed: undo the reservation made above */
+               counter._.working --;
+               counter._.active --;
+       });
+
+       return FALSE;
 }
 
 static void monitor_ensure_running (void);
@@ -715,18 +702,28 @@ static gboolean
 worker_request (MonoDomain *domain)
 {
        ThreadPoolDomain *tpdomain;
-       ThreadPoolCounter counter;
 
        g_assert (domain);
        g_assert (threadpool);
 
-       if (mono_runtime_is_shutting_down () || mono_domain_is_unloading (domain))
+       if (mono_runtime_is_shutting_down ())
                return FALSE;
 
        mono_mutex_lock (&threadpool->domains_lock);
-       tpdomain = domain_get_or_create (domain);
+
+       /* synchronize check with worker_thread */
+       if (mono_domain_is_unloading (domain)) {
+               mono_mutex_unlock (&threadpool->domains_lock);
+               return FALSE;
+       }
+
+       tpdomain = domain_get (domain, TRUE);
        g_assert (tpdomain);
        tpdomain->outstanding_request ++;
+
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] request worker, domain = %p, outstanding_request = %d",
+               GetCurrentThreadId (), tpdomain->domain, tpdomain->outstanding_request);
+
        mono_mutex_unlock (&threadpool->domains_lock);
 
        if (threadpool->suspended)
@@ -734,29 +731,48 @@ worker_request (MonoDomain *domain)
 
        monitor_ensure_running ();
 
-       if (worker_try_unpark ())
+       if (worker_try_unpark ()) {
+               mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] request worker, unparked", GetCurrentThreadId ());
                return TRUE;
+       }
 
-       COUNTER_ATOMIC (counter, {
-               if (counter._.active >= counter._.max_working)
-                       return FALSE;
-               counter._.active ++;
-       });
-
-       if (worker_try_create (tpdomain))
+       if (worker_try_create ()) {
+               mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] request worker, created", GetCurrentThreadId ());
                return TRUE;
+       }
 
-       COUNTER_ATOMIC (counter, { counter._.active --; });
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] request worker, failed", GetCurrentThreadId ());
        return FALSE;
 }
 
 static gboolean
 monitor_should_keep_running (void)
 {
+       static gint64 last_should_keep_running = -1;
+
        g_assert (monitor_status == MONITOR_STATUS_WAITING_FOR_REQUEST || monitor_status == MONITOR_STATUS_REQUESTED);
 
        if (InterlockedExchange (&monitor_status, MONITOR_STATUS_WAITING_FOR_REQUEST) == MONITOR_STATUS_WAITING_FOR_REQUEST) {
-               if (mono_runtime_is_shutting_down () || !domain_any_has_request ()) {
+               gboolean should_keep_running = TRUE, force_should_keep_running = FALSE;
+
+               if (mono_runtime_is_shutting_down ()) {
+                       should_keep_running = FALSE;
+               } else {
+                       if (!domain_any_has_request ())
+                               should_keep_running = FALSE;
+
+                       if (!should_keep_running) {
+                               if (last_should_keep_running == -1 || mono_100ns_ticks () - last_should_keep_running < MONITOR_MINIMAL_LIFETIME * 1000 * 10) {
+                                       should_keep_running = force_should_keep_running = TRUE;
+                               }
+                       }
+               }
+
+               if (should_keep_running) {
+                       if (last_should_keep_running == -1 || !force_should_keep_running)
+                               last_should_keep_running = mono_100ns_ticks ();
+               } else {
+                       last_should_keep_running = -1;
                        if (InterlockedCompareExchange (&monitor_status, MONITOR_STATUS_NOT_RUNNING, MONITOR_STATUS_WAITING_FOR_REQUEST) == MONITOR_STATUS_WAITING_FOR_REQUEST)
                                return FALSE;
                }
@@ -777,7 +793,8 @@ monitor_sufficient_delay_since_last_dequeue (void)
        if (threadpool->cpu_usage < CPU_USAGE_LOW) {
                threshold = MONITOR_INTERVAL;
        } else {
-               ThreadPoolCounter counter = COUNTER_READ ();
+               ThreadPoolCounter counter;
+               counter.as_gint64 = COUNTER_READ();
                threshold = counter._.max_working * MONITOR_INTERVAL * 2;
        }
 
@@ -794,6 +811,8 @@ monitor_thread (void)
 
        mono_cpu_usage (threadpool->cpu_usage_state);
 
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] monitor thread, started", GetCurrentThreadId ());
+
        do {
                MonoInternalThread *thread;
                gboolean all_waitsleepjoin = TRUE;
@@ -815,8 +834,10 @@ monitor_thread (void)
                                break;
                        interval_left -= mono_msec_ticks () - ts;
 
+                       mono_gc_set_skip_thread (FALSE);
                        if ((current_thread->state & (ThreadState_StopRequested | ThreadState_SuspendRequested)) != 0)
                                mono_thread_interruption_checkpoint ();
+                       mono_gc_set_skip_thread (TRUE);
                } while (interval_left > 0 && ++awake < 10);
 
                mono_gc_set_skip_thread (FALSE);
@@ -827,7 +848,7 @@ monitor_thread (void)
                if (mono_runtime_is_shutting_down () || !domain_any_has_request ())
                        continue;
 
-               mono_mutex_lock (&threadpool->working_threads_lock);
+               mono_mutex_lock (&threadpool->active_threads_lock);
                for (i = 0; i < threadpool->working_threads->len; ++i) {
                        thread = g_ptr_array_index (threadpool->working_threads, i);
                        if ((thread->state & ThreadState_WaitSleepJoin) == 0) {
@@ -835,7 +856,7 @@ monitor_thread (void)
                                break;
                        }
                }
-               mono_mutex_unlock (&threadpool->working_threads_lock);
+               mono_mutex_unlock (&threadpool->active_threads_lock);
 
                if (all_waitsleepjoin) {
                        ThreadPoolCounter counter;
@@ -847,33 +868,23 @@ monitor_thread (void)
 
                if (monitor_sufficient_delay_since_last_dequeue ()) {
                        for (i = 0; i < 5; ++i) {
-                               ThreadPoolDomain *tpdomain;
-                               ThreadPoolCounter counter;
-                               gboolean success;
-
                                if (mono_runtime_is_shutting_down ())
                                        break;
 
-                               if (worker_try_unpark ())
+                               if (worker_try_unpark ()) {
+                                       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] monitor thread, unparked", GetCurrentThreadId ());
                                        break;
+                               }
 
-                               COUNTER_TRY_ATOMIC (success, counter, {
-                                       if (counter._.active >= counter._.max_working)
-                                               break;
-                                       counter._.active ++;
-                               });
-
-                               if (!success)
-                                       continue;
-
-                               tpdomain = domain_get_next (NULL);
-                               if (tpdomain && worker_try_create (tpdomain))
+                               if (worker_try_create ()) {
+                                       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] monitor thread, created", GetCurrentThreadId ());
                                        break;
-
-                               COUNTER_ATOMIC (counter, { counter._.active --; });
+                               }
                        }
                }
        } while (monitor_should_keep_running ());
+
+       mono_trace (G_LOG_LEVEL_DEBUG, MONO_TRACE_THREADPOOL, "[%p] monitor thread, finished", GetCurrentThreadId ());
 }
 
 static void
@@ -887,6 +898,8 @@ monitor_ensure_running (void)
                        InterlockedCompareExchange (&monitor_status, MONITOR_STATUS_REQUESTED, MONITOR_STATUS_WAITING_FOR_REQUEST);
                        break;
                case MONITOR_STATUS_NOT_RUNNING:
+                       if (mono_runtime_is_shutting_down ())
+                               return;
                        if (InterlockedCompareExchange (&monitor_status, MONITOR_STATUS_REQUESTED, MONITOR_STATUS_NOT_RUNNING) == MONITOR_STATUS_NOT_RUNNING) {
                                if (!mono_thread_create_internal (mono_get_root_domain (), monitor_thread, NULL, TRUE, SMALL_STACK))
                                        monitor_status = MONITOR_STATUS_NOT_RUNNING;
@@ -907,6 +920,8 @@ hill_climbing_change_thread_count (gint16 new_thread_count, ThreadPoolHeuristicS
 
        hc = &threadpool->heuristic_hill_climbing;
 
+       mono_trace (G_LOG_LEVEL_INFO, MONO_TRACE_THREADPOOL, "[%p] hill climbing, change max number of threads %d", GetCurrentThreadId (), new_thread_count);
+
        hc->last_thread_count = new_thread_count;
        hc->current_sample_interval = rand_next (&hc->random_interval_generator, hc->sample_interval_low, hc->sample_interval_high);
        hc->elapsed_since_last_change = 0;
@@ -928,7 +943,7 @@ hill_climbing_force_change (gint16 new_thread_count, ThreadPoolHeuristicStateTra
        }
 }
 
-static double complex
+static double_complex
 hill_climbing_get_wave_component (gdouble *samples, guint sample_count, gdouble period)
 {
        ThreadPoolHillClimbing *hc;
@@ -953,7 +968,7 @@ hill_climbing_get_wave_component (gdouble *samples, guint sample_count, gdouble
                q1 = q0;
        }
 
-       return ((q1 - q2 * cosine) + (q2 * sine) * I) / ((gdouble) sample_count);
+       return mono_double_complex_scalar_div (mono_double_complex_make (q1 - q2 * cosine, (q2 * sine)), ((gdouble)sample_count));
 }
 
 static gint16
@@ -970,9 +985,9 @@ hill_climbing_update (gint16 current_thread_count, guint32 sample_duration, gint
        gint sample_count;
        gint new_thread_wave_magnitude;
        gint new_thread_count;
-       double complex thread_wave_component;
-       double complex throughput_wave_component;
-       double complex ratio;
+       double_complex thread_wave_component;
+       double_complex throughput_wave_component;
+       double_complex ratio;
 
        g_assert (threadpool);
        g_assert (adjustment_interval);
@@ -1032,10 +1047,10 @@ hill_climbing_update (gint16 current_thread_count, guint32 sample_duration, gint
        hc->total_samples ++;
 
        /* Set up defaults for our metrics. */
-       thread_wave_component = 0;
-       throughput_wave_component = 0;
+       thread_wave_component = mono_double_complex_make(0, 0);
+       throughput_wave_component = mono_double_complex_make(0, 0);
        throughput_error_estimate = 0;
-       ratio = 0;
+       ratio = mono_double_complex_make(0, 0);
        confidence = 0;
 
        transition = TRANSITION_WARMUP;
@@ -1073,17 +1088,17 @@ hill_climbing_update (gint16 current_thread_count, guint32 sample_duration, gint
                        /* Get the the three different frequency components of the throughput (scaled by average
                         * throughput). Our "error" estimate (the amount of noise that might be present in the
                         * frequency band we're really interested in) is the average of the adjacent bands. */
-                       throughput_wave_component = hill_climbing_get_wave_component (hc->samples, sample_count, hc->wave_period) / average_throughput;
-                       throughput_error_estimate = cabs (hill_climbing_get_wave_component (hc->samples, sample_count, adjacent_period_1) / average_throughput);
+                       throughput_wave_component = mono_double_complex_scalar_div (hill_climbing_get_wave_component (hc->samples, sample_count, hc->wave_period), average_throughput);
+                       throughput_error_estimate = cabs (mono_double_complex_scalar_div (hill_climbing_get_wave_component (hc->samples, sample_count, adjacent_period_1), average_throughput));
 
                        if (adjacent_period_2 <= sample_count) {
-                               throughput_error_estimate = MAX (throughput_error_estimate, cabs (hill_climbing_get_wave_component (
-                                       hc->samples, sample_count, adjacent_period_2) / average_throughput));
+                               throughput_error_estimate = MAX (throughput_error_estimate, cabs (mono_double_complex_scalar_div (hill_climbing_get_wave_component (
+                                       hc->samples, sample_count, adjacent_period_2), average_throughput)));
                        }
 
                        /* Do the same for the thread counts, so we have something to compare to. We don't
                         * measure thread count noise, because there is none; these are exact measurements. */
-                       thread_wave_component = hill_climbing_get_wave_component (hc->thread_counts, sample_count, hc->wave_period) / average_thread_count;
+                       thread_wave_component = mono_double_complex_scalar_div (hill_climbing_get_wave_component (hc->thread_counts, sample_count, hc->wave_period), average_thread_count);
 
                        /* Update our moving average of the throughput noise. We'll use this
                         * later as feedback to determine the new size of the thread wave. */
@@ -1097,10 +1112,10 @@ hill_climbing_update (gint16 current_thread_count, guint32 sample_duration, gint
                        if (cabs (thread_wave_component) > 0) {
                                /* Adjust the throughput wave so it's centered around the target wave,
                                 * and then calculate the adjusted throughput/thread ratio. */
-                               ratio = (throughput_wave_component - (hc->target_throughput_ratio * thread_wave_component)) / thread_wave_component;
+                               ratio = mono_double_complex_div (mono_double_complex_sub (throughput_wave_component, mono_double_complex_scalar_mul(thread_wave_component, hc->target_throughput_ratio)), thread_wave_component);
                                transition = TRANSITION_CLIMBING_MOVE;
                        } else {
-                               ratio = 0;
+                               ratio = mono_double_complex_make (0, 0);
                                transition = TRANSITION_STABILIZING;
                        }
 
@@ -1181,8 +1196,9 @@ heuristic_should_adjust (void)
        g_assert (threadpool);
 
        if (threadpool->heuristic_last_dequeue > threadpool->heuristic_last_adjustment + threadpool->heuristic_adjustment_interval) {
-               ThreadPoolCounter counter = COUNTER_READ ();
-               if (counter._.active <= counter._.max_working)
+               ThreadPoolCounter counter;
+               counter.as_gint64 = COUNTER_READ();
+               if (counter._.working <= counter._.max_working)
                        return TRUE;
        }
 
@@ -1203,7 +1219,7 @@ heuristic_adjust (void)
                        ThreadPoolCounter counter;
                        gint16 new_thread_count;
 
-                       counter = COUNTER_READ ();
+                       counter.as_gint64 = COUNTER_READ ();
                        new_thread_count = hill_climbing_update (counter._.max_working, sample_duration, completions, &threadpool->heuristic_adjustment_interval);
 
                        COUNTER_ATOMIC (counter, { counter._.max_working = new_thread_count; });
@@ -1222,10 +1238,10 @@ heuristic_adjust (void)
 void
 mono_threadpool_ms_cleanup (void)
 {
-#ifndef DISABLE_SOCKETS
-       mono_threadpool_ms_io_cleanup ();
-#endif
-       ensure_cleanedup ();
+       #ifndef DISABLE_SOCKETS
+               mono_threadpool_ms_io_cleanup ();
+       #endif
+       mono_lazy_cleanup (&status, cleanup);
 }
 
 MonoAsyncResult *
@@ -1242,7 +1258,7 @@ mono_threadpool_ms_begin_invoke (MonoDomain *domain, MonoObject *target, MonoMet
                async_call_klass = mono_class_from_name (mono_defaults.corlib, "System", "MonoAsyncCall");
        g_assert (async_call_klass);
 
-       ensure_initialized (NULL);
+       mono_lazy_initialize (&status, initialize);
 
        message = mono_method_call_message_new (method, params, mono_get_delegate_invoke (method->klass), (params != NULL) ? (&async_callback) : NULL, (params != NULL) ? (&state) : NULL);
 
@@ -1326,6 +1342,8 @@ mono_threadpool_ms_remove_domain_jobs (MonoDomain *domain, int timeout)
        g_assert (domain);
        g_assert (timeout >= -1);
 
+       g_assert (mono_domain_is_unloading (domain));
+
        if (timeout != -1)
                start = mono_msec_ticks ();
 
@@ -1337,6 +1355,7 @@ mono_threadpool_ms_remove_domain_jobs (MonoDomain *domain, int timeout)
                        return FALSE;
        }
 #endif
+
        /*
         * There might be some threads out that could be about to execute stuff from the given domain.
         * We avoid that by setting up a semaphore to be pulsed by the thread that reaches zero.
@@ -1372,55 +1391,57 @@ mono_threadpool_ms_remove_domain_jobs (MonoDomain *domain, int timeout)
 void
 mono_threadpool_ms_suspend (void)
 {
-       threadpool->suspended = TRUE;
+       if (threadpool)
+               threadpool->suspended = TRUE;
 }
 
 void
 mono_threadpool_ms_resume (void)
 {
-       threadpool->suspended = FALSE;
+       if (threadpool)
+               threadpool->suspended = FALSE;
 }
 
 void
-ves_icall_System_Threading_Microsoft_ThreadPool_GetAvailableThreadsNative (gint *worker_threads, gint *completion_port_threads)
+ves_icall_System_Threading_ThreadPool_GetAvailableThreadsNative (gint32 *worker_threads, gint32 *completion_port_threads)
 {
        if (!worker_threads || !completion_port_threads)
                return;
 
-       ensure_initialized (NULL);
+       mono_lazy_initialize (&status, initialize);
 
        *worker_threads = threadpool->limit_worker_max;
        *completion_port_threads = threadpool->limit_io_max;
 }
 
 void
-ves_icall_System_Threading_Microsoft_ThreadPool_GetMinThreadsNative (gint *worker_threads, gint *completion_port_threads)
+ves_icall_System_Threading_ThreadPool_GetMinThreadsNative (gint32 *worker_threads, gint32 *completion_port_threads)
 {
        if (!worker_threads || !completion_port_threads)
                return;
 
-       ensure_initialized (NULL);
+       mono_lazy_initialize (&status, initialize);
 
        *worker_threads = threadpool->limit_worker_min;
        *completion_port_threads = threadpool->limit_io_min;
 }
 
 void
-ves_icall_System_Threading_Microsoft_ThreadPool_GetMaxThreadsNative (gint *worker_threads, gint *completion_port_threads)
+ves_icall_System_Threading_ThreadPool_GetMaxThreadsNative (gint32 *worker_threads, gint32 *completion_port_threads)
 {
        if (!worker_threads || !completion_port_threads)
                return;
 
-       ensure_initialized (NULL);
+       mono_lazy_initialize (&status, initialize);
 
        *worker_threads = threadpool->limit_worker_max;
        *completion_port_threads = threadpool->limit_io_max;
 }
 
 MonoBoolean
-ves_icall_System_Threading_Microsoft_ThreadPool_SetMinThreadsNative (gint worker_threads, gint completion_port_threads)
+ves_icall_System_Threading_ThreadPool_SetMinThreadsNative (gint32 worker_threads, gint32 completion_port_threads)
 {
-       ensure_initialized (NULL);
+       mono_lazy_initialize (&status, initialize);
 
        if (worker_threads <= 0 || worker_threads > threadpool->limit_worker_max)
                return FALSE;
@@ -1434,11 +1455,11 @@ ves_icall_System_Threading_Microsoft_ThreadPool_SetMinThreadsNative (gint worker
 }
 
 MonoBoolean
-ves_icall_System_Threading_Microsoft_ThreadPool_SetMaxThreadsNative (gint worker_threads, gint completion_port_threads)
+ves_icall_System_Threading_ThreadPool_SetMaxThreadsNative (gint32 worker_threads, gint32 completion_port_threads)
 {
        gint cpu_count = mono_cpu_count ();
 
-       ensure_initialized (NULL);
+       mono_lazy_initialize (&status, initialize);
 
        if (worker_threads < threadpool->limit_worker_min || worker_threads < cpu_count)
                return FALSE;
@@ -1452,13 +1473,18 @@ ves_icall_System_Threading_Microsoft_ThreadPool_SetMaxThreadsNative (gint worker
 }
 
 void
-ves_icall_System_Threading_Microsoft_ThreadPool_InitializeVMTp (MonoBoolean *enable_worker_tracking)
+ves_icall_System_Threading_ThreadPool_InitializeVMTp (MonoBoolean *enable_worker_tracking)
 {
-       ensure_initialized (enable_worker_tracking);
+       if (enable_worker_tracking) {
+               // TODO implement some kind of switch to have the possibility to use it
+               *enable_worker_tracking = FALSE;
+       }
+
+       mono_lazy_initialize (&status, initialize);
 }
 
 MonoBoolean
-ves_icall_System_Threading_Microsoft_ThreadPool_NotifyWorkItemComplete (void)
+ves_icall_System_Threading_ThreadPool_NotifyWorkItemComplete (void)
 {
        ThreadPoolCounter counter;
 
@@ -1470,12 +1496,12 @@ ves_icall_System_Threading_Microsoft_ThreadPool_NotifyWorkItemComplete (void)
        if (heuristic_should_adjust ())
                heuristic_adjust ();
 
-       counter = COUNTER_READ ();
-       return counter._.active <= counter._.max_working;
+       counter.as_gint64 = COUNTER_READ ();
+       return counter._.working <= counter._.max_working;
 }
 
 void
-ves_icall_System_Threading_Microsoft_ThreadPool_NotifyWorkItemProgressNative (void)
+ves_icall_System_Threading_ThreadPool_NotifyWorkItemProgressNative (void)
 {
        heuristic_notify_work_completed ();
 
@@ -1484,20 +1510,20 @@ ves_icall_System_Threading_Microsoft_ThreadPool_NotifyWorkItemProgressNative (vo
 }
 
 void
-ves_icall_System_Threading_Microsoft_ThreadPool_ReportThreadStatus (MonoBoolean is_working)
+ves_icall_System_Threading_ThreadPool_ReportThreadStatus (MonoBoolean is_working)
 {
        // TODO
        mono_raise_exception (mono_get_exception_not_implemented (NULL));
 }
 
 MonoBoolean
-ves_icall_System_Threading_Microsoft_ThreadPool_RequestWorkerThread (void)
+ves_icall_System_Threading_ThreadPool_RequestWorkerThread (void)
 {
        return worker_request (mono_domain_get ());
 }
 
 MonoBoolean G_GNUC_UNUSED
-ves_icall_System_Threading_Microsoft_ThreadPool_PostQueuedCompletionStatus (MonoNativeOverlapped *native_overlapped)
+ves_icall_System_Threading_ThreadPool_PostQueuedCompletionStatus (MonoNativeOverlapped *native_overlapped)
 {
        /* This copy the behavior of the current Mono implementation */
        mono_raise_exception (mono_get_exception_not_implemented (NULL));
@@ -1505,14 +1531,14 @@ ves_icall_System_Threading_Microsoft_ThreadPool_PostQueuedCompletionStatus (Mono
 }
 
 MonoBoolean G_GNUC_UNUSED
-ves_icall_System_Threading_Microsoft_ThreadPool_BindIOCompletionCallbackNative (gpointer file_handle)
+ves_icall_System_Threading_ThreadPool_BindIOCompletionCallbackNative (gpointer file_handle)
 {
        /* This copy the behavior of the current Mono implementation */
        return TRUE;
 }
 
 MonoBoolean G_GNUC_UNUSED
-ves_icall_System_Threading_Microsoft_ThreadPool_IsThreadPoolHosted (void)
+ves_icall_System_Threading_ThreadPool_IsThreadPoolHosted (void)
 {
        return FALSE;
 }