[MACH] Add retry logic to all mach api calls in the case of interruption.
author Rodrigo Kumpera <kumpera@gmail.com>
Thu, 14 Apr 2016 00:04:05 +0000 (17:04 -0700)
committer Rodrigo Kumpera <kumpera@gmail.com>
Thu, 14 Apr 2016 17:25:41 +0000 (10:25 -0700)
One overlooked aspect of Mach is that its calls are interruptible, and
we never really accounted for that.

This problem turned up when sampling at high frequency, as Mach calls
would start to fail randomly.

mono/metadata/mono-perfcounters.c
mono/metadata/sgen-os-mach.c
mono/mini/aot-runtime.c
mono/mini/mini-darwin.c
mono/mini/mini-posix.c
mono/utils/mach-support.c
mono/utils/mono-os-semaphore.h
mono/utils/mono-proclib.c
mono/utils/mono-threads-mach-abort-syscall.c
mono/utils/mono-threads-mach.c

index 445df5c8334ef7931c3c373c82a5168d71d8842b..e7148a823f68f457bcaebb95496b4d1114a35f7c 100644 (file)
@@ -502,7 +502,12 @@ mono_determine_physical_ram_available_size (void)
        mach_port_t host = mach_host_self();
        vm_size_t page_size;
        vm_statistics_data_t vmstat;
-       if (KERN_SUCCESS != host_statistics(host, HOST_VM_INFO, (host_info_t)&vmstat, &count)) {
+       kern_return_t ret;
+       do {
+               ret = host_statistics(host, HOST_VM_INFO, (host_info_t)&vmstat, &count);
+       } while (ret == KERN_ABORTED);
+
+       if (ret != KERN_SUCCESS) {
                g_warning ("Mono was unable to retrieve memory usage!");
                return 0;
        }
index 10ac567111ec81b022d058662e3dbde0ec612375..1c45e375af5dffb372cc6b0338189ecee7e96d82 100644 (file)
 gboolean
 sgen_resume_thread (SgenThreadInfo *info)
 {
-       return thread_resume (info->client_info.info.native_handle) == KERN_SUCCESS;
+       kern_return_t ret;
+       do {
+               ret = thread_resume (info->client_info.info.native_handle);
+       } while (ret == KERN_ABORTED);
+       return ret == KERN_SUCCESS;
 }
 
 gboolean
@@ -51,11 +55,15 @@ sgen_suspend_thread (SgenThreadInfo *info)
        state = (thread_state_t) alloca (mono_mach_arch_get_thread_state_size ());
        mctx = (mcontext_t) alloca (mono_mach_arch_get_mcontext_size ());
 
-       ret = thread_suspend (info->client_info.info.native_handle);
+       do {
+               ret = thread_suspend (info->client_info.info.native_handle);
+       } while (ret == KERN_ABORTED);
        if (ret != KERN_SUCCESS)
                return FALSE;
 
-       ret = mono_mach_arch_get_thread_state (info->client_info.info.native_handle, state, &num_state);
+       do {
+               ret = mono_mach_arch_get_thread_state (info->client_info.info.native_handle, state, &num_state);
+       } while (ret == KERN_ABORTED);
        if (ret != KERN_SUCCESS)
                return FALSE;
 
@@ -117,7 +125,9 @@ sgen_thread_handshake (BOOL suspend)
                        if (!sgen_suspend_thread (info))
                                continue;
                } else {
-                       ret = thread_resume (info->client_info.info.native_handle);
+                       do {
+                               ret = thread_resume (info->client_info.info.native_handle);
+                       } while (ret == KERN_ABORTED);
                        if (ret != KERN_SUCCESS)
                                continue;
                }
index 2d789a2f1767d43ecfcbce86822b9ad78132ea56..d3f3cf2e952c82b946e69b7a834082609d318d2e 100644 (file)
@@ -5222,7 +5222,9 @@ get_new_trampoline_from_page (int tramp_type)
                /* allocate two contiguous pages of memory: the first page will contain the data (like a local constant pool)
                 * while the second will contain the trampolines.
                 */
-               ret = vm_allocate (mach_task_self (), &addr, psize * 2, VM_FLAGS_ANYWHERE);
+               do {
+                       ret = vm_allocate (mach_task_self (), &addr, psize * 2, VM_FLAGS_ANYWHERE);
+               } while (ret == KERN_ABORTED);
                if (ret != KERN_SUCCESS) {
                        g_error ("Cannot allocate memory for trampolines: %d", ret);
                        break;
index c2907a9b494a8c4b5d879105aab45e05bea564f3..a52f5eb80d06aef3137d018ac67b0940961f8460 100644 (file)
@@ -185,7 +185,9 @@ mono_thread_state_init_from_handle (MonoThreadUnwindState *tctx, MonoThreadInfo
        state = (thread_state_t) alloca (mono_mach_arch_get_thread_state_size ());
        mctx = (mcontext_t) alloca (mono_mach_arch_get_mcontext_size ());
 
-       ret = mono_mach_arch_get_thread_state (info->native_handle, state, &num_state);
+       do {
+               ret = mono_mach_arch_get_thread_state (info->native_handle, state, &num_state);
+       } while (ret == KERN_ABORTED);
        if (ret != KERN_SUCCESS)
                return FALSE;
 
index d5787acf13b6dff1e2468d5002302a1aca978a16..8e6646cc77c99efc1c45680045bef393c5bbc337 100644 (file)
@@ -517,7 +517,11 @@ clock_init (void)
 {
        kern_return_t ret;
 
-       if ((ret = host_get_clock_service (mach_host_self (), SYSTEM_CLOCK, &sampling_clock_service)) != KERN_SUCCESS)
+       do {
+               ret = host_get_clock_service (mach_host_self (), SYSTEM_CLOCK, &sampling_clock_service);
+       } while (ret == KERN_ABORTED);
+
+       if (ret != KERN_SUCCESS)
                g_error ("%s: host_get_clock_service () returned %d", __func__, ret);
 }
 
@@ -526,7 +530,11 @@ clock_cleanup (void)
 {
        kern_return_t ret;
 
-       if ((ret = mach_port_deallocate (mach_task_self (), sampling_clock_service)) != KERN_SUCCESS)
+       do {
+               ret = mach_port_deallocate (mach_task_self (), sampling_clock_service);
+       } while (ret == KERN_ABORTED);
+
+       if (ret != KERN_SUCCESS)
                g_error ("%s: mach_port_deallocate () returned %d", __func__, ret);
 }
 
@@ -536,7 +544,11 @@ clock_get_time_ns (void)
        kern_return_t ret;
        mach_timespec_t mach_ts;
 
-       if ((ret = clock_get_time (sampling_clock_service, &mach_ts)) != KERN_SUCCESS)
+       do {
+               ret = clock_get_time (sampling_clock_service, &mach_ts);
+       } while (ret == KERN_ABORTED);
+
+       if (ret != KERN_SUCCESS)
                g_error ("%s: clock_get_time () returned %d", __func__, ret);
 
        return ((guint64) mach_ts.tv_sec * 1000000000) + (guint64) mach_ts.tv_nsec;
@@ -553,10 +565,11 @@ clock_sleep_ns_abs (guint64 ns_abs)
 
        do {
                ret = clock_sleep (sampling_clock_service, TIME_ABSOLUTE, then, &remain_unused);
-
-               if (ret != KERN_SUCCESS && ret != KERN_ABORTED)
-                       g_error ("%s: clock_sleep () returned %d", __func__, ret);
        } while (ret == KERN_ABORTED);
+
+       if (ret != KERN_SUCCESS)
+               g_error ("%s: clock_sleep () returned %d", __func__, ret);
+
 }
 
 #else
index 19e745f630c0c54eb46f030dfb5c3e62e7b7de20..314b30d972c2b18c065f33f6978a050b64bb10f0 100644 (file)
@@ -27,7 +27,7 @@ mono_mach_get_threads (thread_act_array_t *threads, guint32 *count)
 
        do {
                ret = task_threads (current_task (), threads, count);
-       } while (ret != KERN_SUCCESS);
+       } while (ret == KERN_ABORTED);
 
        return ret;
 }
index 7ab1b9df7876d24e2bf88fe0858c177491512bcf..16e3d463b8b7377ab0dfb6ec22b8adcc15b76f66 100644 (file)
@@ -131,10 +131,13 @@ static inline int
 mono_os_sem_post (MonoSemType *sem)
 {
        int res;
-
+retry:
        res = semaphore_signal (*sem);
        g_assert (res != KERN_INVALID_ARGUMENT);
 
+       if (res == KERN_ABORTED)
+               goto retry;
+
        return res != KERN_SUCCESS ? -1 : 0;
 }
 
index c53af96454a79e84a98239ad450d0e9f8658154d..479655189b519be4c51a7b7c274679e8111970f5 100644 (file)
@@ -355,22 +355,35 @@ get_process_stat_item (int pid, int pos, int sum, MonoProcessError *error)
        mach_msg_type_number_t t_info_count = TASK_BASIC_INFO_COUNT, th_count;
        thread_array_t th_array;
        size_t i;
+       kern_return_t ret;
 
        if (pid == getpid ()) {
                /* task_for_pid () doesn't work on ios, even for the current process */
                task = mach_task_self ();
        } else {
-               if (task_for_pid (mach_task_self (), pid, &task) != KERN_SUCCESS)
+               do {
+                       ret = task_for_pid (mach_task_self (), pid, &task);
+               } while (ret == KERN_ABORTED);
+
+               if (ret != KERN_SUCCESS)
                        RET_ERROR (MONO_PROCESS_ERROR_NOT_FOUND);
        }
 
-       if (task_info (task, TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count) != KERN_SUCCESS) {
+       do {
+               ret = task_info (task, TASK_BASIC_INFO, (task_info_t)&t_info, &t_info_count);
+       } while (ret == KERN_ABORTED);
+
+       if (ret != KERN_SUCCESS) {
                if (pid != getpid ())
                        mach_port_deallocate (mach_task_self (), task);
                RET_ERROR (MONO_PROCESS_ERROR_OTHER);
        }
+
+       do {
+               ret = task_threads (task, &th_array, &th_count);
+       } while (ret == KERN_ABORTED);
        
-       if (task_threads(task, &th_array, &th_count) != KERN_SUCCESS) {
+       if (ret  != KERN_SUCCESS) {
                if (pid != getpid ())
                        mach_port_deallocate (mach_task_self (), task);
                RET_ERROR (MONO_PROCESS_ERROR_OTHER);
@@ -381,7 +394,11 @@ get_process_stat_item (int pid, int pos, int sum, MonoProcessError *error)
                
                struct thread_basic_info th_info;
                mach_msg_type_number_t th_info_count = THREAD_BASIC_INFO_COUNT;
-               if (thread_info(th_array[i], THREAD_BASIC_INFO, (thread_info_t)&th_info, &th_info_count) == KERN_SUCCESS) {
+               do {
+                       ret = thread_info(th_array[i], THREAD_BASIC_INFO, (thread_info_t)&th_info, &th_info_count);
+               } while (ret == KERN_ABORTED);
+
+               if (ret == KERN_SUCCESS) {
                        thread_user_time = th_info.user_time.seconds + th_info.user_time.microseconds / 1e6;
                        thread_system_time = th_info.system_time.seconds + th_info.system_time.microseconds / 1e6;
                        //thread_percent = (double)th_info.cpu_usage / TH_USAGE_SCALE;
@@ -494,16 +511,25 @@ get_pid_status_item (int pid, const char *item, MonoProcessError *error, int mul
        task_t task;
        struct task_basic_info t_info;
        mach_msg_type_number_t th_count = TASK_BASIC_INFO_COUNT;
+       kern_return_t mach_ret;
 
        if (pid == getpid ()) {
                /* task_for_pid () doesn't work on ios, even for the current process */
                task = mach_task_self ();
        } else {
-               if (task_for_pid (mach_task_self (), pid, &task) != KERN_SUCCESS)
+               do {
+                       mach_ret = task_for_pid (mach_task_self (), pid, &task);
+               } while (mach_ret == KERN_ABORTED);
+
+               if (mach_ret != KERN_SUCCESS)
                        RET_ERROR (MONO_PROCESS_ERROR_NOT_FOUND);
        }
-       
-       if (task_info (task, TASK_BASIC_INFO, (task_info_t)&t_info, &th_count) != KERN_SUCCESS) {
+
+       do {
+               mach_ret = task_info (task, TASK_BASIC_INFO, (task_info_t)&t_info, &th_count);
+       } while (mach_ret == KERN_ABORTED);
+
+       if (mach_ret != KERN_SUCCESS) {
                if (pid != getpid ())
                        mach_port_deallocate (mach_task_self (), task);
                RET_ERROR (MONO_PROCESS_ERROR_OTHER);
index 056a88f150a1cd72b8b5a1fb9d771ae8495c8845..559d2fa63c94525e4c36384bb02213e3ac3155a4 100644 (file)
@@ -49,11 +49,16 @@ mono_threads_core_abort_syscall (MonoThreadInfo *info)
 {
        kern_return_t ret;
 
-       ret = thread_suspend (info->native_handle);
+       do {
+               ret = thread_suspend (info->native_handle);
+       } while (ret == KERN_ABORTED);
+
        if (ret != KERN_SUCCESS)
                return;
 
-       ret = thread_abort_safely (info->native_handle);
+       do {
+               ret = thread_abort_safely (info->native_handle);
+       } while (ret == KERN_ABORTED);
 
        /*
         * We are doing thread_abort when thread_abort_safely returns KERN_SUCCESS because
@@ -66,7 +71,11 @@ mono_threads_core_abort_syscall (MonoThreadInfo *info)
        if (ret == KERN_SUCCESS)
                ret = thread_abort (info->native_handle);
 
-       g_assert (thread_resume (info->native_handle) == KERN_SUCCESS);
+       do {
+               ret = thread_resume (info->native_handle);
+       } while (ret == KERN_ABORTED);
+
+       g_assert (ret == KERN_SUCCESS);
 }
 
 gboolean
index 6f2ddfa78af52357be210b50a2a43c3a73526b52..6c2b07e468f5686872fb4a9db872be7f30dacdd2 100644 (file)
@@ -60,7 +60,11 @@ mono_threads_core_begin_async_suspend (MonoThreadInfo *info, gboolean interrupt_
 
        g_assert (info);
 
-       ret = thread_suspend (info->native_handle);
+
+       do {
+               ret = thread_suspend (info->native_handle);
+       } while (ret == KERN_ABORTED);
+
        THREADS_SUSPEND_DEBUG ("SUSPEND %p -> %d\n", (void*)info->native_handle, ret);
        if (ret != KERN_SUCCESS)
                return FALSE;
@@ -68,7 +72,10 @@ mono_threads_core_begin_async_suspend (MonoThreadInfo *info, gboolean interrupt_
        /* We're in the middle of a self-suspend, resume and register */
        if (!mono_threads_transition_finish_async_suspend (info)) {
                mono_threads_add_to_pending_operation_set (info);
-               g_assert (thread_resume (info->native_handle) == KERN_SUCCESS);
+               do {
+                       ret = thread_resume (info->native_handle);
+               } while (ret == KERN_ABORTED);
+               g_assert (ret == KERN_SUCCESS);
                THREADS_SUSPEND_DEBUG ("FAILSAFE RESUME/1 %p -> %d\n", (void*)info->native_handle, 0);
                //XXX interrupt_kernel doesn't make sense in this case as the target is not in a syscall
                return TRUE;
@@ -81,7 +88,10 @@ mono_threads_core_begin_async_suspend (MonoThreadInfo *info, gboolean interrupt_
                        thread_abort (info->native_handle);
        } else {
                mono_threads_transition_async_suspend_compensation (info);
-               g_assert (thread_resume (info->native_handle) == KERN_SUCCESS);
+               do {
+                       ret = thread_resume (info->native_handle);
+               } while (ret == KERN_ABORTED);
+               g_assert (ret == KERN_SUCCESS);
                THREADS_SUSPEND_DEBUG ("FAILSAFE RESUME/2 %p -> %d\n", (void*)info->native_handle, 0);
        }
        return res;
@@ -112,7 +122,10 @@ mono_threads_core_begin_async_resume (MonoThreadInfo *info)
                state = (thread_state_t) alloca (mono_mach_arch_get_thread_state_size ());
                mctx = (mcontext_t) alloca (mono_mach_arch_get_mcontext_size ());
 
-               ret = mono_mach_arch_get_thread_state (info->native_handle, state, &num_state);
+               do {
+                       ret = mono_mach_arch_get_thread_state (info->native_handle, state, &num_state);
+               } while (ret == KERN_ABORTED);
+
                if (ret != KERN_SUCCESS)
                        return FALSE;
 
@@ -122,12 +135,17 @@ mono_threads_core_begin_async_resume (MonoThreadInfo *info)
 
                mono_mach_arch_mcontext_to_thread_state (mctx, state);
 
-               ret = mono_mach_arch_set_thread_state (info->native_handle, state, num_state);
+               do {
+                       ret = mono_mach_arch_set_thread_state (info->native_handle, state, num_state);
+               } while (ret == KERN_ABORTED);
+
                if (ret != KERN_SUCCESS)
                        return FALSE;
        }
 
-       ret = thread_resume (info->native_handle);
+       do {
+               ret = thread_resume (info->native_handle);
+       } while (ret == KERN_ABORTED);
        THREADS_SUSPEND_DEBUG ("RESUME %p -> %d\n", (void*)info->native_handle, ret);
 
        return ret == KERN_SUCCESS;